Source code for train.state

"""

:class:`~train.State` objects can be used to represent the agent's state. They can be used to save the recent observations seen by agent and process them before passing to the :func:`~train.Agent.act` method. The following example saves last 2 observations (images) after transforming them (crop, scale etc.) and computes the difference between them which can be useful for tracking motion:

.. code:: python

    from train import State

    class MyState(State):

        def __init__(self, **kwargs):
            super(MyState, self).__init__(length=2, **kwargs)

        def process_observation(self, observation):
            x = observation
            x = x[35:-15, :, :] # crop
            x = np.dot(x, [.299, .587, .114]) # grayscale
            x = x / 255 # scale
            return x

        def process_state(self, state):
            prev, current = state
            diff = current - prev
            return diff.reshape(diff.shape + (1, ))

Custom state objects can be passed to agent during initialization:

.. code:: python

    state = MyState()
    agent = MyAgent(state=state, env=env)
"""

import random
from collections import namedtuple, deque

import numpy as np

from .utils import zeros_like

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


[docs]class State():
    """
    Core class to represent agent's state. Saves recent observations seen by agent.

    Args:
        length (int): Number of recent observations to save.
        zeros (array_like): Array of zeros with same shape as each observation that will be used to pad initial states when number of recent observations is smaller than length of state.
    """

    def __init__(self, length=0, zeros=None):
        self.length = length
        self.zeros = zeros
        self.reset()

[docs]    def update(self, observation):
        """Update the current state based on new observation.

        Args:
            observation (array_like): Observation returned by environment.
        """
        assert observation is not None
        observation = self.process_observation(observation)
        if self.zeros is None:
            self.zeros = zeros_like(observation)
            self.pad()
        if self.length == 0:
            self.data = observation
        else:
            self.data.append(observation)

[docs]    def process_observation(self, observation):
        """Process observation before saving it.

        Args:
            observation (array_like): Observation returned by environment.

        Returns:
            array_like: Processed observation.
        """
        return observation

[docs]    def get(self, asarray=True, dtype='float32'):
        """Get the current state.

        Args:
            asarray (bool): If ``True`` returns an :class:`~numpy.ndarray`.
            dtype (~numpy.dtype): Data type of the returned value.

        Returns:
            (array_like, list): Processed state.
        """
        if self.length == 0:
            state = self.data
        else:
            state = list(self.data)
        state = self.process_state(state)
        if asarray:
            state = np.array(state, dtype=dtype)
        return state

[docs]    def process_state(self, state):
        """Process state before passing it to :func:`~train.Agent.act`.

        Args:
            state (array_like, list): List of recent observations.

        Returns:
            (array_like, list): Processed state.
        """
        return state

[docs]    def reset(self):
        """Reset current state.
        """
        if self.length == 0:
            self.data = None
        else:
            self.data = deque(maxlen=self.length)
        if self.zeros is not None:
            self.pad()

    def pad(self):
        assert self.zeros is not None
        if self.length == 0 and self.data is None:
            self.data = self.zeros
        else:
            while len(self.data) < self.length:
                self.data.appendleft(self.zeros)


# See https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#training-loop
class RingBuffer():

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.reset()

    def append(self, item):
        maxlen = self.maxlen
        if len(self.data) < maxlen or maxlen <= 0:
            self.data.append(None)
        self.data[self.pos] = item
        self.pos += 1
        if maxlen > 0:
            self.pos %= maxlen

    def get(self):
        return self.data[self.pos:] + self.data[:self.pos]

    def last(self):
        """Return last transition.

        Returns:
            Transition: Last transition.

        Raises:
            IndexError: When it is empty.
        """
        return self.data[self.pos - 1]

    def sample(self, batch_size):
        return random.sample(self.data, batch_size)

    def reset(self):
        """Reset transitions.
        """
        self.data = []
        self.pos = 0

    def __len__(self):
        return len(self.data)


[docs]class Transitions(RingBuffer):
    """
    Queue like data structure to save recent transitions observed by agent. Can be used as a replay buffer for algorithms like DQN.

    Args:
        maxlen (int): Number of recent transitions to save. When negative, there is no limit on the number of transitions saved.
    """

[docs]    def get(self, **kwargs):
        """Get all transitions.

        Returns:
            (list, Transition): List of transitions or a Transition object containing lists of values.
        """
        data = super(Transitions, self).get()
        return self.get_transitions(data, **kwargs)

[docs]    def sample(self, batch_size, **kwargs):
        """Randomly sample transitions.

        Args:
            batch_size (int): Number of transitions to sample.

        Returns:
            (list, Transition): List of transitions or a Transition object containing lists of values.
        """
        data = super(Transitions, self).sample(batch_size)
        return self.get_transitions(data, **kwargs)

    # See https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#training-loop
    def get_transitions(self, data, **kwargs):
        transpose = kwargs.get('transpose', True)
        asarray = kwargs.get('asarray', True)
        dtype = kwargs.get('dtype', 'float32')
        if not transpose:
            return data
        data = Transition(*zip(*data))
        if not asarray:
            return data
        states = np.array(data.state, dtype=dtype)
        actions = np.array(data.action, dtype='int32')
        next_states = np.array(data.next_state, dtype=dtype)
        rewards = np.array(data.reward, dtype=dtype)
        dones = np.array(data.done, dtype='uint8')
        return Transition(state=states,
                          action=actions,
                          next_state=next_states,
                          reward=rewards,
                          done=dones)