In [None]:
# @title Introduction
# @markdown This notebook is designed to run on Google Colab.
# @markdown We provide APIs for you to train/test your agent in a volleyball tournament.

# @markdown If this is your first time running the notebook, you will be required to restart the runtime **after this cell is run**.


import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from IPython.display import clear_output, HTML
import imageio
from base64 import b64encode

import warnings


def is_colab():
    """Report whether this code is executing inside Google Colab.

    The check is a probe import of ``google.colab``.

    Returns:
        bool: True if the import succeeds, False if it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only as an availability probe
    except ImportError:
        in_colab = False
    else:
        in_colab = True
    return in_colab

try:
    import slimevolleygym
    import gym
except ModuleNotFoundError:
    if is_colab():
        # SlimeVolley depends on gym<=0.21.0 for display,
        # we cannot install gym<=0.21.0 unless we downgrad setuptools and wheel.
        # Also, we are going to need opengl to render the screen
        !apt-get install -y xvfb python3-opengl x11-utils
        !pip install git+https://github.com/hardmaru/slimevolleygym.git
        !pip install setuptools==65.5.0 "wheel<0.40.0"
        !pip install pip==24.0.0
        !pip install gym==0.19.0
        !pip install pyglet==1.5.11
        !pip install pyvirtualdisplay==0.2.5
        clear_output()
        import slimevolleygym
        import gym
    else:
        warnings.warn('You are running the code in a local env.' +
                      'Please make sure you create a separate conda/venv environment and install the packages listed above.')

from multiprocessing import Pool, cpu_count
from pyvirtualdisplay import Display
# Start an invisible (visible=0) 400x300 display through pyvirtualdisplay.
# Presumably this is what lets `env.render('rgb_array')` work on a machine
# without a monitor -- confirm.
display = Display(visible=0, size=(400, 300))
display.start()
# Wipe whatever this cell has printed so far.
clear_output()

In [None]:
# @title Know the game
# @markdown In the animation, the agent on the left is controlled by a pre-trained neural network policy,
# @markdown and the agent on the right is controlled by a random policy.


def play_video(image_list, fps=30):
    """Encode frames as an MP4 and return an inline HTML video player.

    Args:
        image_list: Iterable of frames; each frame is handed to the imageio
            writer through ``append_data`` (for example the arrays returned
            by ``env.render('rgb_array')``).
        fps: Frame rate of the encoded video.

    Returns:
        IPython.display.HTML: A looping ``<video>`` element whose source is
        the MP4 embedded as a base64 ``data:`` URL.
    """
    # The video is staged in this fixed file and overwritten on every call.
    output_video = '/tmp/temp_video.mp4'
    with imageio.get_writer(output_video, fps=fps) as writer:
        for img in image_list:
            writer.append_data(img)
    # Read the file back inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(output_video, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()
    return HTML("""
    <video width=400 controls loop>
        <source src="%s" type="video/mp4">
    </video>
    """ % data_url)


def rand_control(seed):
    """Play one episode with randomly sampled actions and record frames.

    The environment is stepped with a single action per step, sampled from
    ``env.action_space``; the total reward is printed when the episode ends.

    Args:
        seed: Value passed to ``env.seed()`` before the episode starts.

    Returns:
        list: Frames from ``env.render('rgb_array')``, captured on every
        third step (steps 0, 3, 6, ...).
    """
    env = gym.make('SlimeVolley-v0')
    total_reward = 0
    imgs = []
    try:
        env.seed(seed)
        env.reset()
        done = False
        step = 0
        while not done:
            action = env.action_space.sample()
            # Only the reward and the termination flag are needed here.
            _, reward, done, _ = env.step(action)
            total_reward += reward
            if step % 3 == 0:
                imgs.append(env.render('rgb_array'))
            step += 1
    finally:
        # Release the environment even if stepping or rendering fails.
        env.close()
    print(f'reward={total_reward}')
    return imgs


# Seed for the demo episode; the API walk-through cell further down also reads `seed`.
seed = 42
images = rand_control(seed)
# Kept as the last expression of the cell so that the notebook displays the player.
play_video(images)



reward=-5


In [None]:
# @title Know the game API (READ ME)

# @markdown In this game, the signature of `env.step()` is `def step(self, action, otherAction=None)`.
# @markdown By giving both `action` (which controls the action of the right player) and `otherAction` (which controls the left player),
# @markdown a match between 2 policies is possible. The agents' observations are in the return values of `env.step()`.

# @markdown This code cell provides a walk-through of the game environment API.
# @markdown Refer to the [code here](https://github.com/hardmaru/slimevolleygym/blob/master/slimevolleygym/slimevolley.py) for more details.
# @markdown After learning the code here, you should be able to train/test your agent(s).


# Initialize the game environment.
env = gym.make('SlimeVolley-v0')
env.multiagent = True  # Enable multi-player.

# `seed` comes from the "Know the game" cell above; run that cell first.
env.seed(seed)
env.reset()
render = True
done = False
imgs = []
step = 0

while not done:

    # Randomly sample actions for both players.
    # The valid actions are:
    #   [0, 0, 0]   - NOOP
    #   [1, 0, 0]   - LEFT (forward)
    #   [1, 0, 1]   - UPLEFT (forward jump)
    #   [0, 0, 1]   - UP (jump)
    #   [0, 1, 1]   - UPRIGHT (backward jump)
    #   [0, 1, 0]   - RIGHT (backward)
    #
    # You should implement more sophisticated neural network policies for this.
    action_left = env.action_space.sample()
    action_right = env.action_space.sample()

    # Apply the actions and get observations.
    # Note the argument order: the right player's action is passed first.
    # The observations include:
    #   obs[0:2]    - player position (x, y)
    #   obs[2:4]    - player velocity (vx, vy)
    #   obs[4:6]    - ball position (bx, by)
    #   obs[6:8]    - ball velocity (bvx, bvy)
    #   obs[8:10]   - opponent position (ox, oy)
    #   obs[10:12]  - opponent velocity (vox, voy)
    # Reward (w.r.t the right player) is sparse:
    #   1   - win
    #   -1  - loss
    #   0   - otherwise
    obs_right, reward, done, info = env.step(action_right, action_left)
    # The left player's observation is delivered through the `info` dict.
    obs_left = info['otherObs']

    # Render the game environment (every 3rd step only, presumably to keep the video small).
    # `render` should be turned off during training and evaluation.
    if render and step % 3 == 0:
        imgs.append(env.render('rgb_array'))
    step += 1

env.close()
play_video(imgs)



In [None]:
# @title Know the tournament API (READ ME)

# @markdown In your code, you are expected to implement `get_action`,
# @markdown and it will be called in the following manner.

import importlib


def load_agent(module_name):
    """Import an agent module by name, reflecting the current file on disk.

    Agent files are usually written (and rewritten) after the kernel has
    started. A plain ``importlib.import_module`` would keep returning the
    first version it loaded, so an already-imported module is reloaded.

    Args:
        module_name: Importable module name, e.g. ``'rand_left'`` for
            ``rand_left.py``.

    Returns:
        module: The freshly imported or reloaded module object.

    Raises:
        ModuleNotFoundError: If no module with that name can be found.
    """
    import sys  # Local import: `sys` is not imported at the top of the notebook.

    # Drop the import system's cached directory listings so that a source
    # file created since the interpreter started can be found.
    importlib.invalidate_caches()

    cached = sys.modules.get(module_name)
    if cached is not None:
        # Re-execute the source so that edits to the file take effect and
        # module-level state starts fresh.
        return importlib.reload(cached)
    return importlib.import_module(module_name)


def get_action(module, obs):
    """Ask an agent module for its next move.

    Args:
        module: Loaded agent module; it must expose ``get_action(obs)``.
        obs: Observation handed to the agent unchanged.

    Returns:
        Whatever the agent's ``get_action`` returns.
    """
    agent_policy = module.get_action  # Implement this in your code.
    return agent_policy(obs)


def tournament(left_player_name, right_player_name, render=False):
    """Play one SlimeVolley episode between two agent modules.

    Args:
        left_player_name: Module name of the agent controlling the left player.
        right_player_name: Module name of the agent controlling the right player.
        render: If True, capture a frame on every third step.

    Returns:
        tuple: ``(left_reward, right_reward, imgs)``. ``right_reward`` is the
        sum of the rewards returned by ``env.step``, ``left_reward`` is its
        negation, and ``imgs`` is the list of captured frames (empty when
        ``render`` is False).
    """
    # Load the modules.
    left_player = load_agent(left_player_name)
    right_player = load_agent(right_player_name)

    # Initialize the game.
    env = gym.make('SlimeVolley-v0')
    left_reward = 0
    right_reward = 0
    imgs = []
    try:
        env.multiagent = True
        env.reset()
        # Take one no-op step for both players to obtain both initial
        # observations; the reward and done flag of this step are ignored.
        obs, _, _, info = env.step([0, 0, 0], [0, 0, 0])
        right_obs = obs
        left_obs = info['otherObs']

        # Play the game.
        done = False
        steps = 0
        while not done:
            left_action = get_action(left_player, left_obs)
            right_action = get_action(right_player, right_obs)
            # env.step takes the right player's action first.
            obs, reward, done, info = env.step(right_action, left_action)
            right_obs = obs
            left_obs = info['otherObs']
            # The reward is from the right player's point of view.
            right_reward += reward
            left_reward -= reward
            steps += 1
            if render and steps % 3 == 0:
                imgs.append(env.render('rgb_array'))
    finally:
        # Release the environment even if an agent's get_action raises.
        env.close()

    return left_reward, right_reward, imgs

In [None]:
# @title Example implementation (READ ME)

# @markdown We illustrate how the tournament works with random control policies.

# Source code of a random agent, kept as a string so that it can be written to
# a .py file and loaded as a module. Its `get_action(obs)` ignores the
# observation and returns one of the six listed actions uniformly at random.
agent_code = """
import numpy as np

actions = np.array([
    [0, 0, 0],
    [1, 0, 0],
    [1, 0, 1],
    [0, 0, 1],
    [0, 1, 1],
    [0, 1, 0],
])

def get_action(obs):
    return actions[np.random.randint(0, len(actions))]
"""

# Write the same random-policy source under two module names, one per side.
left_agent_name, right_agent_name = 'rand_left', 'rand_right'
for agent_name in (left_agent_name, right_agent_name):
    with open(f'{agent_name}.py', 'w') as agent_file:
        agent_file.write(agent_code)

# Play one rendered match between the two modules.
left_reward, right_reward, imgs = tournament(
    left_player_name=left_agent_name,
    right_player_name=right_agent_name,
    render=True,
)

print(f'left_reward={left_reward}')
print(f'right_reward={right_reward}')

# Announce the outcome; equal rewards mean a draw.
if left_reward == right_reward:
    print('It is a draw.')
elif left_reward > right_reward:
    print(f'The winner is {left_agent_name}')
else:
    print(f'The winner is {right_agent_name}')
play_video(imgs)



left_reward=-2
right_reward=2
The winner is rand_right


In [None]:
# @title Another implementation example (READ ME)

# @markdown We illustrate how the tournament works with two copies of the pre-trained policy playing against each other.


# Source code of a pre-trained agent, kept as a string so that it can be
# written to a .py file and loaded as a module. The module builds a single
# `BaselinePolicy` instance and exposes it through `get_action(obs)`.
pretrained_agent_code = """
import numpy as np

class BaselinePolicy:

  def __init__(self):
    self.nGameInput = 8 # 8 states for agent
    self.nGameOutput = 3 # 3 buttons (forward, backward, jump)
    self.nRecurrentState = 4 # extra recurrent states for feedback.

    self.nOutput = self.nGameOutput+self.nRecurrentState
    self.nInput = self.nGameInput+self.nOutput

    # store current inputs and outputs
    self.inputState = np.zeros(self.nInput)
    self.outputState = np.zeros(self.nOutput)
    self.prevOutputState = np.zeros(self.nOutput)

    self.weight = np.array(
      [7.5719, 4.4285, 2.2716, -0.3598, -7.8189, -2.5422, -3.2034, 0.3935, 1.2202, -0.49, -0.0316, 0.5221, 0.7026, 0.4179, -2.1689,
       1.646, -13.3639, 1.5151, 1.1175, -5.3561, 5.0442, 0.8451, 0.3987, -2.9501, -3.7811, -5.8994, 6.4167, 2.5014, 7.338, -2.9887,
       2.4586, 13.4191, 2.7395, -3.9708, 1.6548, -2.7554, -1.5345, -6.4708, 9.2426, -0.7392, 0.4452, 1.8828, -2.6277, -10.851, -3.2353,
       -4.4653, -3.1153, -1.3707, 7.318, 16.0902, 1.4686, 7.0391, 1.7765, -1.155, 2.6697, -8.8877, 1.1958, -3.2839, -5.4425, 1.6809,
       7.6812, -2.4732, 1.738, 0.3781, 0.8718, 2.5886, 1.6911, 1.2953, -9.0052, -4.6038, -6.7447, -2.5528, 0.4391, -4.9278, -3.6695,
       -4.8673, -1.6035, 1.5011, -5.6124, 4.9747, 1.8998, 3.0359, 6.2983, -4.8568, -2.1888, -4.1143, -3.9874, -0.0459, 4.7134, 2.8952,
       -9.3627, -4.685, 0.3601, -1.3699, 9.7294, 11.5596, 0.1918, 3.0783, 0.0329, -0.1362, -0.1188, -0.7579, 0.3278, -0.977, -0.9377])

    self.bias = np.array([2.2935,-2.0353,-1.7786,5.4567,-3.6368,3.4996,-0.0685])

    # unflatten weight, convert it into 7x15 matrix.
    self.weight = self.weight.reshape(self.nGameOutput+self.nRecurrentState,
      self.nGameInput+self.nGameOutput+self.nRecurrentState)

  def reset(self):
    self.inputState = np.zeros(self.nInput)
    self.outputState = np.zeros(self.nOutput)
    self.prevOutputState = np.zeros(self.nOutput)

  def _forward(self):
    self.prevOutputState = self.outputState
    self.outputState = np.tanh(np.dot(self.weight, self.inputState)+self.bias)

  def _setInputState(self, obs):
    # obs is: (op is opponent). obs is also from perspective of the agent (x values negated for other agent)
    [x, y, vx, vy, ball_x, ball_y, ball_vx, ball_vy, op_x, op_y, op_vx, op_vy] = obs
    self.inputState[0:self.nGameInput] = np.array([x, y, vx, vy, ball_x, ball_y, ball_vx, ball_vy])
    self.inputState[self.nGameInput:] = self.outputState

  def _getAction(self):
    forward = 0
    backward = 0
    jump = 0
    if (self.outputState[0] > 0.75):
      forward = 1
    if (self.outputState[1] > 0.75):
      backward = 1
    if (self.outputState[2] > 0.75):
      jump = 1
    return [forward, backward, jump]

  def predict(self, obs):
    self._setInputState(obs)
    self._forward()
    return self._getAction()


policy = BaselinePolicy()


def get_action(obs):
    return policy.predict(obs)
"""

# Write the pre-trained policy source under two module names, one per side.
left_agent_name, right_agent_name = 'pretrained_left', 'pretrained_right'
for agent_name in (left_agent_name, right_agent_name):
    with open(f'{agent_name}.py', 'w') as agent_file:
        agent_file.write(pretrained_agent_code)

# Play one rendered match between the two modules.
left_reward, right_reward, imgs = tournament(
    left_player_name=left_agent_name,
    right_player_name=right_agent_name,
    render=True,
)

print(f'left_reward={left_reward}')
print(f'right_reward={right_reward}')

# Announce the outcome; equal rewards mean a draw.
if left_reward == right_reward:
    print('It is a draw.')
elif left_reward > right_reward:
    print(f'The winner is {left_agent_name}')
else:
    print(f'The winner is {right_agent_name}')
play_video(imgs)




left_reward=1
right_reward=-1
The winner is pretrained_left


# Step 1. Play against a random agent

From the past homeworks, you should have learned (1) how to build an MLP in NumPy or JAX and (2) how to evolve the MLP with evolutionary algorithms such as CMA-ES. Can you train a simple MLP to beat a randomly controlled agent?
Use the following code as a template to start.

Hint: if the rewards are too sparse for the algorithm to learn, consider turning on the survival bonus reward to make it dense (see more [here](https://github.com/hardmaru/slimevolleygym/blob/master/slimevolleygym/slimevolley.py)).

In [None]:
from multiprocessing import Pool, cpu_count


def mlp_forward(param, obs):
    """Implement a simple MLP here.

    param   - The parameters of the MLP.
    obs     - The agent's observation.
    return  - The agent's action.

    `eval_param()` passes the return value to `env.step()` as the right
    player's action.
    NOTE: until the placeholder below is filled in, this function implicitly
    returns None.
    """
    # Your code here.


def eval_param(param):
    """Score one MLP parameter set over an episode against a random opponent.

    The right player is driven by ``mlp_forward(param, obs)``; the left
    player takes actions sampled from ``env.action_space``.

    Args:
        param: Parameters forwarded to ``mlp_forward``.

    Returns:
        The sum of the rewards returned by ``env.step`` over the episode,
        i.e. the right player's total reward.
    """
    # Initialize the game.
    env = gym.make('SlimeVolley-v0')
    right_reward = 0
    try:
        env.multiagent = True
        env.reset()
        # Take one no-op step for both players to obtain both initial
        # observations; the reward and done flag of this step are ignored.
        obs, _, _, info = env.step([0, 0, 0], [0, 0, 0])
        right_obs = obs
        # `left_obs` and `left_reward` are not used by the random opponent;
        # they are kept as scaffolding for the self-play variant (Step 2).
        left_obs = info['otherObs']

        # Play the game.
        done = False
        left_reward = 0
        while not done:
            left_action = env.action_space.sample()
            right_action = mlp_forward(param, right_obs)
            # env.step takes the right player's action first.
            obs, reward, done, info = env.step(right_action, left_action)
            right_obs = obs
            left_obs = info['otherObs']
            right_reward += reward
            left_reward -= reward
    finally:
        # Release the environment even if mlp_forward or env.step raises.
        env.close()
    return right_reward


# Placeholder: build the evolution-strategy solver here. The loop below only
# relies on two methods: `solver.ask()` and `solver.tell(params, rewards)`.
solver = # Your code here, implement this ES algorithm

# Warning: This can take some time!
num_gens = 300
# One worker process per CPU core; the candidates are evaluated through pool.map.
with Pool(cpu_count()) as pool:
    for gen in range(num_gens):  # Main loop
        params = solver.ask()
        # One call to eval_param per entry of `params`.
        rewards = pool.map(eval_param, params)
        solver.tell(params, rewards)
        print(f'gen={gen}, best_reward={np.max(rewards)}')

Playing against the random agent is easy; you should be able to achieve an average reward of 5. You can export `mlp_forward` and the learned weights into a Python file and load it with the `tournament()` API to test and visualize the controller.

# Step 2: Self-play

AlphaGo learned to master Go by self-play. It is also possible to do so in this volleyball game. Can you change the `eval_param()` function above so that it accepts 2 sets of parameters and allows these 2 agents to play against each other? You should also change the main loop above so that it properly reports the scores to the solver.

In [None]:
def eval_param2(args):
    """Self-play.

    args    - A pair `(param1, param2)` packed into a single argument
              (presumably so the function can be used with `pool.map` -- confirm).
    param1  - Parameters for agent 1.
    param2  - Parameters for agent 2.

    NOTE: until the placeholder below is filled in, this function implicitly
    returns None.
    """
    param1, param2 = args

    # Your code here


# Placeholder: build the evolution-strategy solver here. The loop below only
# relies on two methods: `solver.ask()` and `solver.tell(params, rewards)`.
solver = # Your code here, implement this ES algorithm

# Warning: This can take some time!
num_gens = 300
# One worker process per CPU core is made available as `pool`.
with Pool(cpu_count()) as pool:
    for gen in range(num_gens):  # Main loop
        params = solver.ask()

        # The code added here must define `rewards`: both `solver.tell` and
        # `np.max` below read it.
        # Your code here

        solver.tell(params, rewards)
        print(f'gen={gen}, best_reward={np.max(rewards)}')