Source code for hive.agents.rainbow

import copy
from functools import partial
from typing import Tuple

import numpy as np
import torch

from hive.agents.dqn import DQNAgent
from hive.agents.qnets.base import FunctionApproximator
from hive.agents.qnets.noisy_linear import NoisyLinear
from hive.agents.qnets.qnet_heads import (
    DistributionalNetwork,
    DQNNetwork,
    DuelingNetwork,
)
from hive.agents.qnets.utils import InitializationFn, calculate_output_dim
from hive.replays import PrioritizedReplayBuffer
from hive.replays.replay_buffer import BaseReplayBuffer
from hive.utils.loggers import Logger
from hive.utils.schedule import Schedule
from hive.utils.utils import LossFn, OptimizerFn, seeder


class RainbowDQNAgent(DQNAgent):
    """An agent implementing the Rainbow algorithm."""

    def __init__(
        self,
        representation_net: FunctionApproximator,
        obs_dim: Tuple,
        act_dim: int,
        optimizer_fn: OptimizerFn = None,
        loss_fn: LossFn = None,
        init_fn: InitializationFn = None,
        id=0,
        replay_buffer: BaseReplayBuffer = None,
        discount_rate: float = 0.99,
        n_step: int = 1,
        grad_clip: float = None,
        reward_clip: float = None,
        update_period_schedule: Schedule = None,
        target_net_soft_update: bool = False,
        target_net_update_fraction: float = 0.05,
        target_net_update_schedule: Schedule = None,
        epsilon_schedule: Schedule = None,
        test_epsilon: float = 0.001,
        min_replay_history: int = 5000,
        batch_size: int = 32,
        device="cpu",
        logger: Logger = None,
        log_frequency: int = 100,
        noisy: bool = True,
        std_init: float = 0.5,
        use_eps_greedy: bool = False,
        double: bool = True,
        dueling: bool = True,
        distributional: bool = True,
        v_min: float = 0,
        v_max: float = 200,
        atoms: int = 51,
    ):
        """
        Args:
            representation_net (FunctionApproximator): A network that outputs the
                representations that will be used to compute Q-values (e.g.
                everything except the final layer of the DQN).
            obs_dim: The shape of the observations.
            act_dim (int): The number of actions available to the agent.
            id: Agent identifier.
            optimizer_fn (OptimizerFn): A function that takes in a list of parameters
                to optimize and returns the optimizer. If None, defaults to
                :py:class:`~torch.optim.Adam`.
            loss_fn (LossFn): Loss function used by the agent. If None, defaults to
                :py:class:`~torch.nn.MSELoss`.
            init_fn (InitializationFn): Initializes the weights of qnet using
                create_init_weights_fn.
            replay_buffer (BaseReplayBuffer): The replay buffer that the agent will
                push observations to and sample from during learning. If None,
                defaults to
                :py:class:`~hive.replays.prioritized_replay.PrioritizedReplayBuffer`.
            discount_rate (float): A number between 0 and 1 specifying how much
                future rewards are discounted by the agent.
            n_step (int): The horizon used in n-step returns to compute TD(n) targets.
            grad_clip (float): Gradients will be clipped to between
                [-grad_clip, grad_clip].
            reward_clip (float): Rewards will be clipped to between
                [-reward_clip, reward_clip].
            update_period_schedule (Schedule): Schedule determining how frequently
                the agent's Q-network is updated.
            target_net_soft_update (bool): Whether the target net parameters are
                replaced by the qnet parameters completely or using a weighted
                average of the target net parameters and the qnet parameters.
            target_net_update_fraction (float): The weight given to the target net
                parameters in a soft update.
            target_net_update_schedule (Schedule): Schedule determining how
                frequently the target net is updated.
            epsilon_schedule (Schedule): Schedule determining the value of epsilon
                through the course of training.
            test_epsilon (float): epsilon (probability of choosing a random action)
                to be used during testing phase.
            min_replay_history (int): How many observations to fill the replay
                buffer with before starting to learn.
            batch_size (int): The size of the batch sampled from the replay buffer
                during learning.
            device: Device on which all computations should be run.
            logger (ScheduledLogger): Logger used to log agent's metrics.
            log_frequency (int): How often to log the agent's metrics.
            noisy (bool): Whether to use noisy linear layers for exploration.
            std_init (float): The range for the initialization of the standard
                deviation of the weights.
            use_eps_greedy (bool): Whether to use epsilon greedy exploration.
            double (bool): Whether to use double DQN.
            dueling (bool): Whether to use a dueling network architecture.
            distributional (bool): Whether to use distributional RL.
            v_min (float): The minimum of the support of the categorical value
                distribution for distributional RL.
            v_max (float): The maximum of the support of the categorical value
                distribution for distributional RL.
            atoms (int): Number of atoms discretizing the support range of the
                categorical value distribution for distributional RL.
        """
        self._noisy = noisy
        self._std_init = std_init
        self._double = double
        self._dueling = dueling
        self._distributional = distributional

        self._atoms = atoms if self._distributional else 1
        self._v_min = v_min
        self._v_max = v_max

        if loss_fn is None:
            loss_fn = torch.nn.MSELoss
        if replay_buffer is None:
            replay_buffer = PrioritizedReplayBuffer(seed=seeder.get_new_seed())
        super().__init__(
            representation_net,
            obs_dim,
            act_dim,
            optimizer_fn=optimizer_fn,
            init_fn=init_fn,
            loss_fn=loss_fn,
            id=id,
            replay_buffer=replay_buffer,
            discount_rate=discount_rate,
            n_step=n_step,
            grad_clip=grad_clip,
            reward_clip=reward_clip,
            target_net_soft_update=target_net_soft_update,
            target_net_update_fraction=target_net_update_fraction,
            target_net_update_schedule=target_net_update_schedule,
            update_period_schedule=update_period_schedule,
            epsilon_schedule=epsilon_schedule,
            test_epsilon=test_epsilon,
            min_replay_history=min_replay_history,
            batch_size=batch_size,
            device=device,
            logger=logger,
            log_frequency=log_frequency,
        )

        self._supports = torch.linspace(
            self._v_min, self._v_max, self._atoms, device=self._device
        )
        self._use_eps_greedy = use_eps_greedy
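
    # A minimal construction sketch (illustrative only; the two-layer MLP
    # representation and the reliance on the base class defaults below are
    # assumptions, not a tuned configuration shipped with the library):
    #
    #     agent = RainbowDQNAgent(
    #         representation_net=lambda obs_dim: torch.nn.Sequential(
    #             torch.nn.Linear(obs_dim[0], 256), torch.nn.ReLU()
    #         ),
    #         obs_dim=(4,),
    #         act_dim=2,
    #     )
    #
    # ``representation_net`` is called with ``obs_dim`` in ``create_q_networks``
    # below, so any callable returning a torch module with a computable output
    # shape fits that role.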
    def create_q_networks(self, representation_net):
        """Creates the Q-network and target Q-network. Adds the appropriate heads
        for DQN, Dueling DQN, Noisy Networks, and Distributional DQN.

        Args:
            representation_net: A network that outputs the representations that
                will be used to compute Q-values (e.g. everything except the final
                layer of the DQN).
        """
        network = representation_net(self._obs_dim)
        network_output_dim = np.prod(calculate_output_dim(network, self._obs_dim))

        # Use NoisyLinear when creating output heads if noisy is true
        linear_fn = (
            partial(NoisyLinear, std_init=self._std_init)
            if self._noisy
            else torch.nn.Linear
        )

        # Set up Dueling heads
        if self._dueling:
            network = DuelingNetwork(
                network, network_output_dim, self._act_dim, linear_fn, self._atoms
            )
        else:
            network = DQNNetwork(
                network, network_output_dim, self._act_dim * self._atoms, linear_fn
            )

        # Set up DistributionalNetwork wrapper if distributional is true
        if self._distributional:
            self._qnet = DistributionalNetwork(
                network, self._act_dim, self._v_min, self._v_max, self._atoms
            )
        else:
            self._qnet = network
        self._qnet.to(device=self._device)
        self._qnet.apply(self._init_fn)
        self._target_qnet = copy.deepcopy(self._qnet).requires_grad_(False)
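
    # With the default Rainbow settings (noisy + dueling + distributional), the
    # wrappers created above nest as
    # DistributionalNetwork(DuelingNetwork(representation, ...)), so that
    # ``self._qnet(obs)`` returns expected Q-values of shape (batch, act_dim)
    # while ``self._qnet.dist(obs)`` returns per-action atom probabilities of
    # shape (batch, act_dim, atoms), as used in ``act``, ``update``, and
    # ``target_projection`` below.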
    @torch.no_grad()
    def act(self, observation):
        if self._training:
            if not self._learn_schedule.get_value():
                epsilon = 1.0
            elif not self._use_eps_greedy:
                epsilon = 0.0
            else:
                epsilon = self._epsilon_schedule.update()
            if self._logger.update_step(self._timescale):
                self._logger.log_scalar("epsilon", epsilon, self._timescale)
        else:
            epsilon = self._test_epsilon

        observation = torch.tensor(
            np.expand_dims(observation, axis=0), device=self._device
        ).float()
        qvals = self._qnet(observation)
        if self._rng.random() < epsilon:
            action = self._rng.integers(self._act_dim)
        else:
            action = torch.argmax(qvals).item()

        if (
            self._training
            and self._logger.should_log(self._timescale)
            and self._state["episode_start"]
        ):
            self._logger.log_scalar("train_qval", torch.max(qvals), self._timescale)
            self._state["episode_start"] = False

        return action
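
    # With the Rainbow defaults (noisy=True, use_eps_greedy=False), epsilon stays
    # at 0 once the learn schedule has started, so exploration comes from the
    # parameter noise of the NoisyLinear output heads rather than from random
    # actions; passing use_eps_greedy=True restores the epsilon schedule used by
    # the base DQNAgent.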
    def update(self, update_info):
        """
        Updates the DQN agent.

        Args:
            update_info: dictionary containing all the necessary information to
                update the agent. Should contain a full transition, with keys for
                "observation", "action", "reward", "next_observation", and "done".
        """
        if update_info["done"]:
            self._state["episode_start"] = True

        if not self._training:
            return

        # Add the most recent transition to the replay buffer.
        self._replay_buffer.add(**self.preprocess_update_info(update_info))

        # Update the q network based on a sample batch from the replay buffer.
        # Only update once the learn schedule has started, the replay buffer has
        # samples to draw from, and the update period schedule says it is time.
        if (
            self._learn_schedule.update()
            and self._replay_buffer.size() > 0
            and self._update_period_schedule.update()
        ):
            batch = self._replay_buffer.sample(batch_size=self._batch_size)
            (
                current_state_inputs,
                next_state_inputs,
                batch,
            ) = self.preprocess_update_batch(batch)

            # Compute predicted Q values
            self._optimizer.zero_grad()
            pred_qvals = self._qnet(*current_state_inputs)
            actions = batch["action"].long()

            if self._double:
                next_action = self._qnet(*next_state_inputs)
            else:
                next_action = self._target_qnet(*next_state_inputs)
            next_action = next_action.argmax(1)

            if self._distributional:
                current_dist = self._qnet.dist(*current_state_inputs)
                probs = current_dist[torch.arange(actions.size(0)), actions]
                probs = torch.clamp(probs, 1e-6, 1)  # NaN-guard
                log_p = torch.log(probs)
                with torch.no_grad():
                    target_prob = self.target_projection(
                        next_state_inputs, next_action, batch["reward"], batch["done"]
                    )
                loss = -(target_prob * log_p).sum(-1)
            else:
                pred_qvals = pred_qvals[torch.arange(pred_qvals.size(0)), actions]
                next_qvals = self._target_qnet(*next_state_inputs)
                next_qvals = next_qvals[torch.arange(next_qvals.size(0)), next_action]
                q_targets = batch["reward"] + self._discount_rate * next_qvals * (
                    1 - batch["done"]
                )
                loss = self._loss_fn(pred_qvals, q_targets)

            if isinstance(self._replay_buffer, PrioritizedReplayBuffer):
                td_errors = loss.sqrt().detach().cpu().numpy()
                self._replay_buffer.update_priorities(batch["indices"], td_errors)
                loss *= batch["weights"]
            loss = loss.mean()

            if self._logger.should_log(self._timescale):
                self._logger.log_scalar(
                    "train_loss",
                    loss,
                    self._timescale,
                )

            loss.backward()
            if self._grad_clip is not None:
                torch.nn.utils.clip_grad_value_(
                    self._qnet.parameters(), self._grad_clip
                )
            self._optimizer.step()

        # Update target network
        if self._target_net_update_schedule.update():
            self._update_target()
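
    # For the distributional branch above, the per-sample loss is the
    # cross-entropy between the projected target distribution m and the
    # predicted distribution p(s, a) of the taken action:
    #     loss_i = -sum_k m_k * log p_k(s_i, a_i)
    # When a PrioritizedReplayBuffer is used, sqrt(loss_i) is written back as the
    # new priority and the loss is scaled by the importance weights before
    # averaging, the same path taken by the non-distributional TD-error loss.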
    def target_projection(self, target_net_inputs, next_action, reward, done):
        """Project distribution of target Q-values.

        Args:
            target_net_inputs: Inputs to feed into the target net to compute the
                projection of the target Q-values. Should be set from
                :py:meth:`~hive.agents.dqn.DQNAgent.preprocess_update_batch`.
            next_action (~torch.Tensor): Tensor containing next actions used to
                compute target distribution.
            reward (~torch.Tensor): Tensor containing rewards for the current batch.
            done (~torch.Tensor): Tensor containing whether the states in the
                current batch are terminal.
        """
        reward = reward.reshape(-1, 1)
        not_done = 1 - done.reshape(-1, 1)
        batch_size = reward.size(0)
        next_dist = self._target_qnet.dist(*target_net_inputs)
        next_dist = next_dist[torch.arange(batch_size), next_action]

        dist_supports = reward + not_done * self._discount_rate * self._supports
        dist_supports = dist_supports.clamp(min=self._v_min, max=self._v_max)
        dist_supports = dist_supports.unsqueeze(1)
        dist_supports = dist_supports.tile([1, self._atoms, 1])
        projected_supports = self._supports.tile([batch_size, 1]).unsqueeze(2)

        delta = float(self._v_max - self._v_min) / (self._atoms - 1)
        quotient = 1 - (torch.abs(dist_supports - projected_supports) / delta)
        quotient = quotient.clamp(min=0, max=1)
        projection = torch.sum(quotient * next_dist.unsqueeze(1), dim=2)
        return projection
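

# The categorical projection above can be hard to follow at the tensor-shape
# level, so here is a self-contained sketch of the same triangular-kernel
# projection written against plain torch with toy inputs. The toy reward/done
# values and the softmax "next distribution" are made up for illustration; only
# the projection math mirrors ``RainbowDQNAgent.target_projection``.
def _categorical_projection_sketch():
    torch.manual_seed(0)
    batch_size, atoms, v_min, v_max, discount = 2, 51, 0.0, 200.0, 0.99
    supports = torch.linspace(v_min, v_max, atoms)
    delta = (v_max - v_min) / (atoms - 1)

    # Toy next-state distribution, already indexed by the greedy next action.
    next_dist = torch.softmax(torch.randn(batch_size, atoms), dim=-1)
    reward = torch.tensor([[1.0], [0.0]])
    not_done = torch.tensor([[1.0], [0.0]])  # second transition is terminal

    # Shift and clamp the support: Tz = r + gamma * z for non-terminal states.
    shifted = (reward + not_done * discount * supports).clamp(v_min, v_max)

    # Triangular kernel: each shifted atom spreads its probability mass over the
    # two nearest fixed atoms in proportion to how close it lies to each.
    weight = 1 - (shifted.unsqueeze(1) - supports.view(1, -1, 1)).abs() / delta
    weight = weight.clamp(0, 1)
    projection = (weight * next_dist.unsqueeze(1)).sum(-1)

    # The projection is still a valid probability distribution per sample.
    assert torch.allclose(projection.sum(-1), torch.ones(batch_size), atol=1e-5)
    return projection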