1. Python implementation of the Value Iteration algorithm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def value_iteration(env, gamma=0.99, theta=1e-6):
    """
    Performs value iteration to compute an optimal policy.

    Args:
        env: An environment that provides:
            - env.states: a list of all states
            - env.actions(s): a list of actions available in state s
            - env.transitions(s, a): a list of (probability, next_state, reward) tuples
            - env.is_terminal(s): a function to check if a state is terminal
        gamma: Discount factor.
        theta: Small threshold for convergence.

    Returns:
        A tuple (V, pi, history) where:
            - V is a dict of state -> value
            - pi is a dict of state -> optimal action (None for terminal states)
            - history is a dict of state -> list of value estimates, one per sweep
    """
    V = {s: 0.0 for s in env.states}
    history = {s: [0] for s in env.states}
    for s in env.states:
        if env.is_terminal(s):
            V[s] = 0.0  # terminal states have zero value

    while True:
        delta = 0
        for s in env.states:
            if env.is_terminal(s):
                continue
            v = V[s]
            action_values = []
            for a in env.actions(s):
                q = sum(
                    prob * (reward + gamma * V[next_state])
                    for prob, next_state, reward in env.transitions(s, a)
                )
                action_values.append(q)
            V[s] = max(action_values)
            history[s].append(V[s])
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    # Derive the policy
    policy = {}
    for s in env.states:
        if env.is_terminal(s):
            policy[s] = None
            continue
        best_action = max(
            env.actions(s),
            key=lambda a: sum(
                prob * (reward + gamma * V[next_state])
                for prob, next_state, reward in env.transitions(s, a)
            )
        )
        policy[s] = best_action

    return V, policy, history
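For reference, the update applied to each non-terminal state inside the sweep is the Bellman optimality backup, repeated until the largest change across states falls below theta:

\[ V(s) \leftarrow \max_{a \in \mathcal{A}(s)} \sum_{s', r} p(s', r \mid s, a)\big[ r + \gamma V(s') \big] \]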

1.1 Example usage: Environment for Diabetes Care

  • This environment simulates a diabetes care scenario with different states and actions.
  • The states represent the health status of a patient, and the actions represent different care strategies.
  • The transitions define the probabilities of moving from one state to another based on the action taken.
  • The rewards are based on the health outcomes of the actions taken.

class DiabetesCareEnv:
    def __init__(self):
        # all possible states
        self.states = ["stable", "moderate", "critical", "dead"] 
        self._terminal_states = {"dead"}

    def is_terminal(self, s):
        return s in self._terminal_states

    def actions(self, s):
        if s == "dead":
            return []
        return ["lifestyle", "medicate", "intensive_care"]

    def transitions(self, s, a):
        """
        Returns a list of (probability, next_state, reward) tuples.
        Each action from a state leads to all possible states.
        """

        if s == "stable":
            if a == "lifestyle":
                return [
                    (0.80, "stable", 8),
                    (0.10, "moderate", 0),
                    (0.05, "critical", -10),
                    (0.05, "dead", -100),
                ]
            elif a == "medicate":
                return [
                    (0.85, "stable", 6),
                    (0.10, "moderate", -3),
                    (0.03, "critical", -10),
                    (0.02, "dead", -100),
                ]
            elif a == "intensive_care":
                return [
                    (0.90, "stable", -5),
                    (0.05, "moderate", -8),
                    (0.03, "critical", -15),
                    (0.02, "dead", -100),
                ]

        elif s == "moderate":
            if a == "lifestyle":
                return [
                    (0.50, "stable", 5),
                    (0.30, "moderate", -1),
                    (0.15, "critical", -6),
                    (0.05, "dead", -50),
                ]
            elif a == "medicate":
                return [
                    (0.60, "stable", 6),
                    (0.25, "moderate", 0),
                    (0.10, "critical", -5),
                    (0.05, "dead", -50),
                ]
            elif a == "intensive_care":
                return [
                    (0.70, "stable", 3),
                    (0.20, "moderate", -2),
                    (0.05, "critical", -10),
                    (0.05, "dead", -80),
                ]

        elif s == "critical":
            if a == "lifestyle":
                return [
                    (0.05, "stable", 2),
                    (0.10, "moderate", -3),
                    (0.50, "critical", -12),
                    (0.35, "dead", -100),
                ]
            elif a == "medicate":
                return [
                    (0.10, "stable", 3),
                    (0.30, "moderate", -2),
                    (0.50, "critical", -8),
                    (0.20, "dead", -100),
                ]
            elif a == "intensive_care":
                return [
                    (0.30, "stable", 5),
                    (0.30, "moderate", 0),
                    (0.30, "critical", -3),
                    (0.10, "dead", -90),
                ]

        elif s == "dead":
            return [(1.0, "dead", 0)]

        return []
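As a quick sanity check (a small addition, not part of the original example), we can confirm that the outcome probabilities for every state-action pair sum to 1 before running value iteration; check_env below is just a throwaway instance.

# Sanity check (illustrative): each action's outcome probabilities should form a distribution
check_env = DiabetesCareEnv()
for s in check_env.states:
    for a in check_env.actions(s):
        total = sum(p for p, _, _ in check_env.transitions(s, a))
        assert abs(total - 1.0) < 1e-9, f"{s}/{a}: probabilities sum to {total}"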
env = DiabetesCareEnv()
V, policy, hist = value_iteration(env, gamma=0.95, theta=1e-4)

for state in env.states:
    print(f"State: {state:10s} | Value: {V[state]:6.2f} | Best Action: {policy[state]}")
State: stable     | Value:  22.73 | Best Action: medicate
State: moderate   | Value:  18.36 | Best Action: medicate
State: critical   | Value:   4.63 | Best Action: intensive_care
State: dead       | Value:   0.00 | Best Action: None
plt.figure(figsize=(12, 8))
for state in env.states:
    if state == "dead":
        continue
    plt.plot(hist[state], label=state)
plt.title("Value Iteration Convergence")
plt.xlabel("Iterations")
plt.ylabel("Value")
plt.legend()
plt.grid()
plt.show()

1.2 Interpretation

  • \(\pi(s)\) = lifestyle: Suggests conservative management is best.
  • \(\pi(s)\) = medicate: Indicates medical intervention is effective and worth the cost.
  • \(\pi(s)\) = intensive_care: Signals high urgency; aggressive action justified.
interp = {
    'state': ['stable', 'moderate', 'critical', 'dead'],
    'best_action': ['medicate', 'medicate', 'intensive_care', 'none'],
    'interpretation': [
        'Medication keeps the patient stable at low risk',
        'Best to intervene with medication',
        'Aggressive care needed to avoid death.',
        'No further action needed.'
    ]}
interp_df = pd.DataFrame(interp)
interp_df
state best_action interpretation
0 stable medicate Medication keeps the patient stable at low risk
1 moderate medicate Best to intervene with medication
2 critical intensive_care Aggressive care needed to avoid death.
3 dead none No further action needed.

2. Python implementation of Monte Carlo ES (Exploring Starts), for estimating \(\pi \approx \pi_*\)

Alternating complete steps of policy evaluation and policy improvement are performed, beginning with an arbitrary policy \(\pi_0\) and ending with the optimal policy and optimal action-value function:

\[ \pi_0 \xrightarrow{E} q_{\pi_0} \xrightarrow{I} \pi_1 \xrightarrow{E} q_{\pi_1} \xrightarrow{I} \pi_2 \xrightarrow{E} \cdots \xrightarrow{I} \pi_* \xrightarrow{E} q_* \]

Note: Monte Carlo ES (Exploring Starts), for estimating \(\pi \approx \pi_*\)

Initialize:
\(\pi(s) \in \mathcal{A}(s)\) arbitrarily, for all \(s \in \mathcal{S}\)
\(Q(s, a) \in \mathbb{R}\) arbitrarily, for all \(s \in \mathcal{S}, a \in \mathcal{A}(s)\)
\(\text{Returns}(s, a) \leftarrow\) empty list, for all \(s \in \mathcal{S}, a \in \mathcal{A}(s)\)

Loop forever (for each episode):
 Choose \(S_0 \in \mathcal{S}, A_0 \in \mathcal{A}(S_0)\) randomly, such that all pairs have probability \(> 0\)
 Generate an episode from \(S_0, A_0\), following \(\pi\):
  \(S_0, A_0, R_1, \dots, S_{T-1}, A_{T-1}, R_T\)
\(G \leftarrow 0\)
 Loop for each step of episode, \(t = T-1, T-2, \dots, 0\):
  \(G \leftarrow \gamma G + R_{t+1}\)
  Unless the pair \((S_t, A_t)\) appears in
   \(S_0, A_0, S_1, A_1, \dots, S_{t-1}, A_{t-1}\):
   Append \(G\) to \(\text{Returns}(S_t, A_t)\)
   \(Q(S_t, A_t) \leftarrow \text{average}(\text{Returns}(S_t, A_t))\)
   \(\pi(S_t) \leftarrow \arg\max_a Q(S_t, a)\)

from collections import defaultdict
import random

def monte_carlo_es(env, gamma=0.95, episodes=5000):
    Q = defaultdict(lambda: defaultdict(float)) # initializes Qs
    Q_history = defaultdict(lambda: defaultdict(list)) # initializes Q history
    Returns = defaultdict(lambda: defaultdict(list)) # initializes returns
    pi = {} 

    for s in env.non_terminal_states():
        pi[s] = random.choice(env.actions(s))

    def generate_episode(s0, a0):
        episode = []
        env.reset(s0)
        state = s0
        action = a0
        done = False

        while not done:
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
            action = pi[state]
        return episode

    for _ in range(episodes):
        # Exploring starts: random (state, action) pair with > 0 probability
        s0 = random.choice(env.non_terminal_states())
        a0 = random.choice(env.actions(s0))

        episode = generate_episode(s0, a0)
        G = 0

        for t in reversed(range(len(episode))):
            s, a, r = episode[t]  # state, action, reward at step t (processed from T-1 down to 0)
            G = gamma * G + r  # accumulate the discounted return
            # First-visit check: update only if (s, a) does not appear earlier in the episode
            if not any((s, a) == (x[0], x[1]) for x in episode[:t]):
                Returns[s][a].append(G)  # record the return for this state-action pair
                Q[s][a] = sum(Returns[s][a]) / len(Returns[s][a])  # update Q as the average return
                Q_history[s][a].append(Q[s][a])  # track Q over episodes
                pi[s] = max(env.actions(s), key=lambda x: Q[s][x])  # greedy policy improvement

    return Q, pi, Q_history
import random

class SimulatedHealthcareEnv:
    def __init__(self):
        self.states = ["stable", "moderate", "critical", "dead"]
        self._terminal_states = {"dead"}
        self.state = None

        # Hidden transition model (not visible to agent)
        self.transition_model = {
            "stable": {
                "lifestyle": [("stable", 0.80, 8), ("moderate", 0.10, -2), ("critical", 0.05, -10), ("dead", 0.05, -100)],
                "medicate": [("stable", 0.85, 6), ("moderate", 0.10, -3), ("critical", 0.03, -10), ("dead", 0.02, -100)],
                "intensive_care": [("stable", 0.90, -5), ("moderate", 0.05, -8), ("critical", 0.03, -15), ("dead", 0.02, -100)],
            },
            "moderate": {
                "lifestyle": [("stable", 0.50, 5), ("moderate", 0.30, -1), ("critical", 0.15, -6), ("dead", 0.05, -50)],
                "medicate": [("stable", 0.60, 6), ("moderate", 0.25, 0), ("critical", 0.10, -5), ("dead", 0.05, -50)],
                "intensive_care": [("stable", 0.70, 3), ("moderate", 0.20, -2), ("critical", 0.05, -10), ("dead", 0.05, -80)],
            },
            "critical": {
                "lifestyle": [("stable", 0.05, 2), ("moderate", 0.10, -3), ("critical", 0.50, -12), ("dead", 0.35, -100)],
                "medicate": [("stable", 0.10, 3), ("moderate", 0.20, -2), ("critical", 0.50, -8), ("dead", 0.20, -100)],
                "intensive_care": [("stable", 0.30, 5), ("moderate", 0.30, 0), ("critical", 0.30, -3), ("dead", 0.10, -90)],
            }
        }

    def reset(self, state=None):
        self.state = state or random.choice(self.non_terminal_states())
        return self.state

    def non_terminal_states(self):
        return [s for s in self.states if s not in self._terminal_states]

    def is_terminal(self, s):
        return s in self._terminal_states

    def actions(self, s):
        if self.is_terminal(s):
            return []
        return list(self.transition_model[s].keys())

    def step(self, action):
        transitions = self.transition_model[self.state][action] # get the transitions for the current state and action
        next_states, probs, rewards = zip(*transitions) # unpack the transitions
        idx = random.choices(range(len(probs)), weights=probs)[0] # select an index based on the probabilities
        next_state = next_states[idx] # get the next state
        reward = rewards[idx] # get the reward
        done = self.is_terminal(next_state) # check if the next state is terminal
        self.state = next_state # update the current state
        return next_state, reward, done
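Monte Carlo control never reads transition_model directly; it only interacts with the environment through reset() and step(). A short illustrative rollout (a sketch added for clarity; demo_env and the fixed "medicate" action are arbitrary choices) shows that interface:

# Illustrative rollout (not part of the original notebook): sample a few transitions
demo_env = SimulatedHealthcareEnv()
state = demo_env.reset("moderate")
for _ in range(5):
    action = "medicate"  # fixed action, purely for the demo
    next_state, reward, done = demo_env.step(action)
    print(f"{state} --{action}--> {next_state} (reward {reward})")
    state = next_state
    if done:
        break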
env = SimulatedHealthcareEnv()
Q, pi, q_history = monte_carlo_es(env, gamma=0.95, episodes=10000)

print("Optimal Policy Learned via Monte Carlo ES:")
for state in env.states:
    if state in pi:
        print(f"State: {state:10s} | Best Action: {pi[state]}")
    else:
        print(f"State: {state:10s} | [Terminal]")
Optimal Policy Learned via Monte Carlo ES:
State: stable     | Best Action: intensive_care
State: moderate   | Best Action: lifestyle
State: critical   | Best Action: intensive_care
State: dead       | [Terminal]
from collections import defaultdict
import random

def mc_control_on_policy(env, gamma=0.95, epsilon=0.1, episodes=5000):
    """
    On-policy First-Visit Monte Carlo Control for ε-soft policies.

    Returns:
        - Q: state-action value function
        - pi: policy as a probability distribution over actions
    """

    Q = defaultdict(lambda: defaultdict(float))
    Returns = defaultdict(lambda: defaultdict(list))
    pi = defaultdict(dict)  # pi[state][action] = probability

    # Initialize ε-soft policy arbitrarily
    for s in env.non_terminal_states():
        actions = env.actions(s)
        for a in actions:
            pi[s][a] = 1.0 / len(actions)

    def select_action(state):
        """Sample action from current ε-soft policy."""
        actions = list(pi[state].keys())
        probs = list(pi[state].values())
        return random.choices(actions, weights=probs)[0]

    def generate_episode():
        """Generate one episode following the current policy."""
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = select_action(state)
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward))
            state = next_state
        return episode

    for _ in range(episodes):
        episode = generate_episode()
        G = 0

        for t in reversed(range(len(episode))):
            s, a, r = episode[t]
            G = gamma * G + r
            # First-visit check: update only if (s, a) does not appear earlier in the episode
            if not any((s, a) == (x[0], x[1]) for x in episode[:t]):
                Returns[s][a].append(G)
                Q[s][a] = sum(Returns[s][a]) / len(Returns[s][a])

                # Improve policy at state s
                actions = env.actions(s)
                best_action = max(actions, key=lambda x: Q[s][x])
                for act in actions:
                    if act == best_action:
                        pi[s][act] = 1 - epsilon + epsilon / len(actions)
                    else:
                        pi[s][act] = epsilon / len(actions)

    return Q, pi
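The policy-improvement step above assigns the ε-soft probabilities

\[ \pi(a \mid s) \leftarrow \begin{cases} 1 - \epsilon + \dfrac{\epsilon}{|\mathcal{A}(s)|} & \text{if } a = \arg\max_{a'} Q(s, a') \\[4pt] \dfrac{\epsilon}{|\mathcal{A}(s)|} & \text{otherwise,} \end{cases} \]

which is why, with \(\epsilon = 0.1\) and three actions per state, the learned policy below places probability 0.93 on the greedy action and 0.03 on each of the others.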

env = SimulatedHealthcareEnv()
Q, pi = mc_control_on_policy(env, gamma=0.95, epsilon=0.1, episodes=10000)

print("Learned ε-soft Policy:")
for state in env.non_terminal_states():
    print(f"State: {state}")
    for action in pi[state]:
        print(f"  Action: {action:15s} | Prob: {pi[state][action]:.2f}")
Learned ε-soft Policy:
State: stable
  Action: lifestyle       | Prob: 0.03
  Action: medicate        | Prob: 0.93
  Action: intensive_care  | Prob: 0.03
State: moderate
  Action: lifestyle       | Prob: 0.03
  Action: medicate        | Prob: 0.03
  Action: intensive_care  | Prob: 0.93
State: critical
  Action: lifestyle       | Prob: 0.03
  Action: medicate        | Prob: 0.03
  Action: intensive_care  | Prob: 0.93
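To compare this result with the other methods, the greedy action in each state can be read off the learned ε-soft policy (a small helper, not part of the original code):

# Illustrative helper: pick the highest-probability action in each state
greedy_pi = {s: max(pi[s], key=pi[s].get) for s in env.non_terminal_states()}
print(greedy_pi)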

3. Python implementation of the SARSA algorithm

from collections import defaultdict
import random

def sarsa(env, alpha=0.1, gamma=0.95, epsilon=0.1, episodes=5000):
    """
    SARSA: On-policy TD control.

    Args:
        env: A simulated environment (like SimulatedHealthcareEnv)
        alpha: Step size (learning rate)
        gamma: Discount factor
        epsilon: Exploration probability for ε-greedy policy
        episodes: Number of episodes to train

    Returns:
        Q: State-action value function
        pi: Final greedy policy derived from Q
    """

    Q = defaultdict(lambda: defaultdict(float))

    def epsilon_greedy_action(state):
        actions = env.actions(state)
        if not actions:
            return None
        if random.random() < epsilon:
            return random.choice(actions)
        else:
            return max(actions, key=lambda a: Q[state][a])

    for _ in range(episodes):
        state = env.reset()
        action = epsilon_greedy_action(state)

        while not env.is_terminal(state):
            next_state, reward, done = env.step(action)
            next_action = epsilon_greedy_action(next_state)

            # TD update: SARSA bootstraps from the action actually selected in S'
            td_target = reward if done else reward + gamma * Q[next_state][next_action]
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            state, action = next_state, next_action

    # Derive final greedy policy
    pi = {}
    for s in env.non_terminal_states():
        actions = env.actions(s)
        if actions:
            pi[s] = max(actions, key=lambda a: Q[s][a])

    return Q, pi
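The TD update implemented above is the standard SARSA rule, which bootstraps from the action actually selected in the next state:

\[ Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \big[ R_{t+1} + \gamma\, Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \big] \]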
env = SimulatedHealthcareEnv()
Q, pi = sarsa(env, alpha=0.1, gamma=0.95, epsilon=0.1, episodes=10000)

print("Final Greedy Policy (SARSA):")
for state in env.states:
    if state in pi:
        print(f"State: {state:10s} | Best Action: {pi[state]}")
    else:
        print(f"State: {state:10s} | [Terminal]")
Final Greedy Policy (SARSA):
State: stable     | Best Action: lifestyle
State: moderate   | Best Action: intensive_care
State: critical   | Best Action: intensive_care
State: dead       | [Terminal]

4. Python implementation of the Q-learning algorithm

from collections import defaultdict
import random

def q_learning(env, alpha=0.1, gamma=0.95, epsilon=0.1, episodes=5000):
    """
    Q-learning: Off-policy TD control to estimate the optimal policy.

    Args:
        env: A simulation environment (e.g., SimulatedHealthcareEnv)
        alpha: Step size (learning rate)
        gamma: Discount factor
        epsilon: Exploration rate for ε-greedy policy
        episodes: Number of training episodes

    Returns:
        Q: State-action value function
        pi: Final greedy policy derived from Q
    """

    Q = defaultdict(lambda: defaultdict(float))

    def epsilon_greedy_action(state):
        actions = env.actions(state)
        if not actions:
            return None
        if random.random() < epsilon:
            return random.choice(actions)
        else:
            return max(actions, key=lambda a: Q[state][a])

    for _ in range(episodes):
        state = env.reset()

        while not env.is_terminal(state):
            action = epsilon_greedy_action(state)
            next_state, reward, done = env.step(action)

            # Off-policy TD target (uses greedy action for S')
            if not done:
                max_Q_next = max(Q[next_state][a] for a in env.actions(next_state))
            else:
                max_Q_next = 0.0

            td_target = reward + gamma * max_Q_next
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            state = next_state

    # Derive the greedy policy from Q
    pi = {}
    for s in env.non_terminal_states():
        actions = env.actions(s)
        if actions:
            pi[s] = max(actions, key=lambda a: Q[s][a])

    return Q, pi
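The corresponding off-policy update is the Q-learning rule, which bootstraps from the greedy action in the next state regardless of which action the ε-greedy behaviour policy actually takes:

\[ Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \big[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \big] \]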
print("Final Greedy Policy (Q-learning):")
for state in env.states:
    if state in pi:
        print(f"State: {state:10s} | Best Action: {pi[state]}")
    else:
        print(f"State: {state:10s} | [Terminal]")
Final Greedy Policy (Q-learning):
State: stable     | Best Action: lifestyle
State: moderate   | Best Action: intensive_care
State: critical   | Best Action: intensive_care
State: dead       | [Terminal]