# !pip install numpy scipy matplotlib

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson

# Seeded random generator (seed 42). It is not used in the setup below;
# presumably it is meant for the simulation exercise -- keep the seed fixed
# if reproducible numbers are wanted.
rng = np.random.default_rng(42)

# State: current stock in {0, ..., 49}. Action: order quantity in {0, ..., 49-s}.
# Demand: Poisson(lambda=3). Holding cost 0.5/unit/period. Stockout cost 5/unit.
# Selling price 4/unit. Ordering cost 2/unit + fixed 10 if order > 0.

S = 50               # number of states: stock levels 0 .. S-1
GAMMA = 0.95         # discount factor used in the value-iteration update
DEMAND_LAMBDA = 3    # mean of the Poisson demand
HOLD = 0.5           # holding cost per unit per period
STOCKOUT = 5.0       # stockout cost per unit
PRICE = 4.0          # selling price per unit
ORDER_VAR = 2.0      # ordering cost per unit ordered
ORDER_FIX = 10.0     # fixed ordering cost, charged only when the order is > 0

# Truncated demand distribution on {0, ..., S}: entries 0 .. S-1 are the exact
# Poisson probabilities and the last entry lumps together the tail P(D >= S).
demand_pmf = poisson.pmf(np.arange(S + 1), DEMAND_LAMBDA)
# Take the tail mass from the survival function, sf(S - 1) = P(D > S - 1),
# rather than from `1.0 - head.sum()`: the head already sums to 1 up to
# rounding, so the subtraction yields rounding noise that can be negative,
# which is not a valid probability.
demand_pmf[-1] = poisson.sf(S - 1, DEMAND_LAMBDA)
print('demand pmf sum:', demand_pmf.sum())

# YOUR TURN
# For each (state s, action a), compute:
# - expected immediate reward r(s, a)
# - transition probabilities P(s' | s, a)

# YOUR TURN
# Initialize V = 0. Iterate V_{k+1}(s) = max_a [r(s, a) + gamma * sum_{s'} P(s'|s,a) V_k(s')]
# until ||V_{k+1} - V_k||_inf < 1e-6. Plot V and the policy.

# YOUR TURN
# Simulate both the optimal policy and an order-up-to-S heuristic for 1000 episodes.
# Report mean total reward, standard error, and the gap.

# Lab 1 — Value iteration on a 50-state inventory MDP

# Setup

# Inventory MDP setup

# Exercise 1 — Build the reward and transition tables

# Exercise 2 — Value iteration

# Exercise 3 — Compare against (s, S) heuristic

# Done?