# !pip install "gymnasium[box2d]" stable-baselines3 numpy matplotlib
# (BipedalWalker-v3 needs the Box2D extra; plain `gymnasium` is not enough.)

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# One seed for the whole lab. Seeding NumPy's global RNG alone does not cover
# the random state used inside Stable-Baselines3, so the same value is also
# passed to PPO below (see the `seed` argument in the SB3 PPO documentation).
SEED = 42
np.random.seed(SEED)

# Build both lab environments up front. env_walk is only inspected here; it is
# left open for the BipedalWalker exercises further down.
env_cart = gym.make('CartPole-v1')
env_walk = gym.make('BipedalWalker-v3')
# CartPole's action space is discrete, so its size is read from `.n`;
# BipedalWalker's action space is described by its `.shape` instead.
print('CartPole obs/act dims:', env_cart.observation_space.shape, env_cart.action_space.n)
print('BipedalWalker obs/act dims:', env_walk.observation_space.shape, env_walk.action_space.shape)

# Exercise 1 baseline: PPO with an MLP policy and library-default
# hyperparameters, seeded so that repeated runs are comparable.
model = PPO('MlpPolicy', env_cart, seed=SEED, verbose=0)
model.learn(total_timesteps=50_000)
# Mean and standard deviation of the per-episode reward over 20 evaluation episodes.
mean, std = evaluate_policy(model, env_cart, n_eval_episodes=20)
print(f'CartPole: {mean:.1f} +/- {std:.1f}')

# YOUR TURN
# Train PPO on BipedalWalker for 500_000 steps. Tune lr, n_steps, ent_coef.
# Report mean reward over 20 eval episodes.

# YOUR TURN
# On BipedalWalker, sweep ent_coef in {0.0, 0.01, 0.1}. Plot learning curves.
# Write a 200-word memo on which hyperparameter (lr, n_steps, or ent_coef) mattered most and why.

# Lab 3 — PPO on CartPole and BipedalWalker

## Setup

## Environments

## Exercise 1 — PPO on CartPole

## Exercise 2 — PPO on BipedalWalker

## Exercise 3 — Hyperparameter sensitivity

## Done?