Commit
Added SQN0, DQL0 for vector input. To do: finish distral, tune hyperparams, play_game at utils.
Showing 27 changed files with 1,574 additions and 783 deletions.
@@ -0,0 +1,119 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor


class DQN(nn.Module):
    """
    Deep neural network that represents an agent.
    """
    def __init__(self, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 5, kernel_size=2)
        self.bn1 = nn.BatchNorm2d(5)
        self.conv2 = nn.Conv2d(5, 10, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(10)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(10)
        self.head = nn.Linear(200, num_actions)

    def forward(self, x):
        x = F.leaky_relu(self.bn1(self.conv1(x)))
        x = F.leaky_relu(self.bn2(self.conv2(x)))
        x = F.leaky_relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))


# Alternative (currently unused) architecture, kept for experiments:
# class DQN(nn.Module):
#     """
#     Deep neural network that represents an agent.
#     """
#     def __init__(self, num_actions):
#         super(DQN, self).__init__()
#         self.conv1 = nn.Conv2d(1, 10, kernel_size=2)
#         self.max_pool = nn.MaxPool2d((2, 2))
#         self.bn1 = nn.BatchNorm2d(10)
#         self.conv2 = nn.Conv2d(10, 20, kernel_size=3)
#         self.bn2 = nn.BatchNorm2d(20)
#         self.linear = nn.Linear(80, 20)
#         # self.bn3 = nn.BatchNorm1d(50)
#         self.head = nn.Linear(20, num_actions)

#     def forward(self, x):
#         x = F.leaky_relu(self.max_pool(self.bn1(self.conv1(x))))
#         x = F.leaky_relu(self.bn2(self.conv2(x)))
#         x = F.leaky_relu(self.linear(x.view(x.size(0), -1)))
#         return self.head(x)

def select_action(state, model, num_actions,
                  EPS_START, EPS_END, EPS_DECAY, steps_done):
    """
    Selects whether the next action is chosen by our model or randomly.
    """
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        return model(
            Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA, BETA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))
    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute the soft value V(s_{t+1}) = (1 / BETA) * log sum_a exp(BETA * Q(s_{t+1}, a))
    # for all non-final next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = torch.log(torch.exp(
        BETA * model(non_final_next_states)).sum(1)) / BETA
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute the MSE loss between predicted and expected Q values
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
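The assignment to next_state_values above implements the soft state value V(s) = (1/BETA) * log sum_a exp(BETA * Q(s, a)). For large BETA or large Q-values the explicit exp can overflow; a minimal sketch of a numerically equivalent, more stable variant using the standard max-subtraction trick is shown below (the helper name soft_value is illustrative and not part of this commit):

def soft_value(q_values, beta):
    # q_values: tensor of shape (batch, num_actions); returns shape (batch,)
    scaled = beta * q_values
    max_q, _ = scaled.max(1, keepdim=True)      # factor out the per-row maximum
    # log-sum-exp with the maximum subtracted, so exp() never sees large arguments
    lse = max_q.squeeze(1) + torch.log(torch.exp(scaled - max_q).sum(1))
    return lse / beta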
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
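A short usage sketch of the buffer above (the state, action, next_state, and reward tensors here are placeholders, not values from this commit; in train_sql.py they come from get_screen and select_action):

memory = ReplayMemory(10000)
memory.push(state, action, next_state, reward)      # store one transition
if len(memory) >= 128:
    transitions = memory.sample(128)                # list of 128 Transition namedtuples
    batch = Transition(*zip(*transitions))          # transpose into batched fields, as in optimize_model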
@@ -0,0 +1,105 @@
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
import torch.optim as optim
import torch
import math
import numpy as np
from memory_replay import ReplayMemory, Transition
from network import DQN, select_action, optimize_model, Tensor
import sys
sys.path.append('../')
from envs.gridworld_env import GridworldEnv
from utils import plot_rewards, plot_durations, plot_state, get_screen


def trainSQL(file_name="SQL", env=GridworldEnv(1), batch_size=128,
             gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
             is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
             learning_rate=0.001, memory_replay_size=10000):
    """
    Soft Q-learning training routine. Returns the trained model together with
    the reward and duration logs and, if is_plot is set, plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        print("Cur episode:", i_episode, "steps done:", t,
              "exploration factor:", eps_end + (eps_start - eps_end) * \
              math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions,
                                   eps_start, eps_end, eps_decay, steps_done)
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    # Store results
    np.save(file_name + '-sql-rewards', episode_rewards)
    np.save(file_name + '-sql-durations', episode_durations)

    return model, episode_rewards, episode_durations
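A minimal way to call the routine above might look as follows (the argument values are illustrative assumptions, not taken from this commit):

if __name__ == '__main__':
    # train soft Q-learning on the first gridworld map and save the logs
    model, rewards, durations = trainSQL(file_name="SQL-grid1",
                                         env=GridworldEnv(1),
                                         num_episodes=300,
                                         is_plot=False)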
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
@@ -0,0 +1,90 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

class DQN(nn.Module):
    """
    Deep neural network that represents an agent.
    """
    def __init__(self, input_size, num_actions):
        super(DQN, self).__init__()
        self.l1 = nn.Linear(input_size, 100)  # to play with different NN architectures
        self.l2 = nn.Linear(100, num_actions)

    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))


def select_action(state, model, num_actions,
                  EPS_START, EPS_END, EPS_DECAY, steps_done):
    """
    Selects whether the next action is chosen by our model or randomly.
    """
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        return model(
            Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))

    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) = max_a Q(s_{t+1}, a) for all non-final next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
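For reference, select_action in both network files anneals the exploration rate with the same exponential schedule. A small standalone check of the threshold, assuming the defaults eps_start=0.9, eps_end=0.05, eps_decay=1000 from trainSQL:

import math

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 1000
for steps_done in (0, 500, 1000, 5000):
    # same formula used inside select_action
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    print(steps_done, round(eps, 3))   # 0.9, 0.566, 0.363, 0.056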