Commit 7279268

Added SQN0, DQL0 for vector input. To do: finish distral, tune hyperparams, play_game at utils.

Alfo5123 committed Mar 17, 2018
1 parent 8701282 commit 7279268
Showing 27 changed files with 1,574 additions and 783 deletions.
119 changes: 119 additions & 0 deletions code/distral/distral.py
@@ -0,0 +1,119 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor


class DQN(nn.Module):
"""
Deep neural network which represents an agent.
"""
def __init__(self, num_actions):
super(DQN, self).__init__()
self.conv1 = nn.Conv2d(1, 5, kernel_size=2)
self.bn1 = nn.BatchNorm2d(5)
self.conv2 = nn.Conv2d(5, 10, kernel_size=3)
self.bn2 = nn.BatchNorm2d(10)
self.conv3 = nn.Conv2d(10, 10, kernel_size=3)
self.bn3 = nn.BatchNorm2d(10)
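# Note: the head's in_features (200) must equal conv3's 10 output channels times the flattened spatial size, which depends on the input grid dimensions.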
self.head = nn.Linear(200, num_actions)

def forward(self, x):
x = F.leaky_relu(self.bn1(self.conv1(x)))
x = F.leaky_relu(self.bn2(self.conv2(x)))
x = F.leaky_relu(self.bn3(self.conv3(x)))
return self.head(x.view(x.size(0), -1))

# class DQN(nn.Module):
# """
# Deep neural network which represents an agent.
# """
# def __init__(self, num_actions):
# super(DQN, self).__init__()
# self.conv1 = nn.Conv2d(1, 10, kernel_size=2)
# self.max_pool = nn.MaxPool2d((2,2))
# self.bn1 = nn.BatchNorm2d(10)
# self.conv2 = nn.Conv2d(10, 20, kernel_size=3)
# self.bn2 = nn.BatchNorm2d(20)
# self.linear = nn.Linear(80, 20)
# # self.bn3 = nn.BatchNorm1d(50)
# self.head = nn.Linear(20, num_actions)

# def forward(self, x):
# x = F.leaky_relu(self.max_pool(self.bn1(self.conv1(x))))
# x = F.leaky_relu(self.bn2(self.conv2(x)))
# x = F.leaky_relu(self.linear(x.view(x.size(0), -1)))
# return self.head(x)

def select_action(state, model, num_actions,
EPS_START, EPS_END, EPS_DECAY, steps_done):
"""
Selects whether the next action is chosen by the model or at random.
"""
sample = random.random()
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
math.exp(-1. * steps_done / EPS_DECAY)
if sample > eps_threshold:
return model(
Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
else:
return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA, BETA):
global last_sync
if len(memory) < BATCH_SIZE:
return
transitions = memory.sample(BATCH_SIZE)
# Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
# detailed explanation).
batch = Transition(*zip(*transitions))

# Compute a mask of non-final states and concatenate the batch elements
non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
batch.next_state)))
# We don't want to backprop through the expected action values and volatile
# will save us on temporarily changing the model parameters'
# requires_grad to False!
non_final_next_states = Variable(torch.cat([s for s in batch.next_state
if s is not None]),
volatile=True)
state_batch = Variable(torch.cat(batch.state))
action_batch = Variable(torch.cat(batch.action))
reward_batch = Variable(torch.cat(batch.reward))

# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken
state_action_values = model(state_batch).gather(1, action_batch)

# Compute V(s_{t+1}) for all next states.
next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
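# Soft Q-learning backup: V(s') = (1/BETA) * log sum_a exp(BETA * Q(s', a)); this approaches max_a Q(s', a) as BETA grows.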
next_state_values[non_final_mask] = torch.log( torch.exp(
BETA * model(non_final_next_states)).sum(1)) / BETA
# Now, we don't want to mess up the loss with a volatile flag, so let's
# clear it. After this, we'll just end up with a Variable that has
# requires_grad=False
next_state_values.volatile = False
# Compute the expected Q values
expected_state_action_values = (next_state_values * GAMMA) + reward_batch

# Compute the MSE loss between predicted and expected Q values
loss = F.mse_loss(state_action_values, expected_state_action_values)

# Optimize the model
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
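
The soft value above implements the soft Q-learning backup V(s') = (1/beta) * log sum_a exp(beta * Q(s', a)). A minimal, numerically stable sketch of the same quantity, assuming a newer PyTorch with torch.logsumexp and a no_grad context in place of the old Variable/volatile API (the soft_value name is illustrative, not part of this commit):

import torch

def soft_value(q_values, beta):
    # q_values: (batch_size, num_actions) tensor of Q(s', a) for the non-final next states.
    # logsumexp subtracts the per-row maximum internally, so exp(beta * Q) cannot overflow.
    return torch.logsumexp(beta * q_values, dim=1) / beta

# Usage sketch inside optimize_model:
# with torch.no_grad():
#     next_state_values[non_final_mask] = soft_value(model(non_final_next_states), BETA)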
26 changes: 26 additions & 0 deletions code/distral/memory_replay.py
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0

def push(self, *args):
"""Saves a transition."""
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = Transition(*args)
self.position = (self.position + 1) % self.capacity

def sample(self, batch_size):
return random.sample(self.memory, batch_size)

def __len__(self):
return len(self.memory)
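
A minimal usage sketch for this buffer; the tensor shapes and capacity below are illustrative, not taken from the commit:

import torch
from memory_replay import ReplayMemory, Transition

memory = ReplayMemory(1000)

# Push a few dummy transitions: screens as 1x1x8x8 tensors, actions as 1x1 LongTensors, rewards as 1-element FloatTensors.
for _ in range(5):
    state = torch.zeros(1, 1, 8, 8)
    action = torch.LongTensor([[0]])
    next_state = torch.zeros(1, 1, 8, 8)
    reward = torch.FloatTensor([0.0])
    memory.push(state, action, next_state, reward)

if len(memory) >= 4:
    # Transpose a list of Transitions into one Transition of lists, as optimize_model does.
    batch = Transition(*zip(*memory.sample(4)))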

105 changes: 105 additions & 0 deletions code/distral/trainingDistral.py
@@ -0,0 +1,105 @@
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
import torch.optim as optim
import torch
import math
import numpy as np
from memory_replay import ReplayMemory, Transition
from network import DQN, select_action, optimize_model, Tensor
import sys
sys.path.append('../')
from envs.gridworld_env import GridworldEnv
from utils import plot_rewards, plot_durations, plot_state, get_screen

def trainSQL(file_name="SQL", env=GridworldEnv(1), batch_size=128,
gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
learning_rate=0.001, memory_replay_size=10000):
"""
Soft Q-learning training routine. Returns the trained model plus reward and duration logs.
Plots the environment screen when is_plot is True.
"""
if is_plot:
env.reset()
plt.ion()
plt.figure()
plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
interpolation='none')
plt.draw()
plt.pause(0.00001)

num_actions = env.action_space.n
model = DQN(num_actions)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = optim.RMSprop(model.parameters(), )

use_cuda = torch.cuda.is_available()
if use_cuda:
model.cuda()

memory = ReplayMemory(memory_replay_size)

episode_durations = []
mean_durations = []
episode_rewards = []
mean_rewards = []

steps_done, t = 0, 0
# plt.ion()
for i_episode in range(num_episodes):
print("Cur episode:", i_episode, "steps done:", t,
"exploration factor:", eps_end + (eps_start - eps_end) * \
math.exp(-1. * steps_done / eps_decay))
# Initialize the environment and state
env.reset()
# last_screen = env.current_grid_map
current_screen = get_screen(env)
state = current_screen # - last_screen
for t in count():
# Select and perform an action
action = select_action(state, model, num_actions,
eps_start, eps_end, eps_decay, steps_done)
_, reward, done, _ = env.step(action[0, 0])
reward = Tensor([reward])

# Observe new state
last_screen = current_screen
current_screen = get_screen(env)
if not done:
next_state = current_screen # - last_screen
else:
next_state = None

# Store the transition in memory
memory.push(state, action, next_state, reward)

# Move to the next state
state = next_state
# plot_state(state)
# env.render()

# Perform one step of the optimization on the policy network
optimize_model(model, optimizer, memory, batch_size, gamma, beta)
if done or t + 1 >= max_num_steps_per_episode:
episode_durations.append(t + 1)
episode_rewards.append(env.episode_total_reward)
if is_plot:
plot_durations(episode_durations, mean_durations)
plot_rewards(episode_rewards, mean_rewards)
steps_done += 1
break

print('Complete')
env.render(close=True)
env.close()
if is_plot:
plt.ioff()
plt.show()

## Store Results

np.save(file_name + '-sql-rewards', episode_rewards)
np.save(file_name + '-sql-durations', episode_durations)

return model, episode_rewards, episode_durations
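
A minimal sketch of how this routine could be invoked; the output prefix and hyperparameter values are illustrative, and the module's own imports are assumed to resolve:

from envs.gridworld_env import GridworldEnv
from trainingDistral import trainSQL

model, rewards, durations = trainSQL(
    file_name="SQL-env1",   # prefix for the saved *-sql-rewards.npy / *-sql-durations.npy files
    env=GridworldEnv(1),
    num_episodes=500,
    beta=5,
    is_plot=False)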
Binary file added code/dqn0/__pycache__/network.cpython-36.pyc
Binary file added code/dqn0/__pycache__/trainingDQN0.cpython-36.pyc
26 changes: 26 additions & 0 deletions code/dqn0/memory_replay.py
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0

def push(self, *args):
"""Saves a transition."""
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = Transition(*args)
self.position = (self.position + 1) % self.capacity

def sample(self, batch_size):
return random.sample(self.memory, batch_size)

def __len__(self):
return len(self.memory)

90 changes: 90 additions & 0 deletions code/dqn0/network.py
@@ -0,0 +1,90 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

class DQN(nn.Module):
"""
Deep neural network which represents an agent.
"""
def __init__(self, input_size, num_actions):
super(DQN, self).__init__()
self.l1 = nn.Linear(input_size, 100)  # hidden layer size; adjust to experiment with different NN architectures
self.l2 = nn.Linear(100, num_actions)

def forward(self, x):
return self.l2(F.relu(self.l1(x)))


def select_action(state, model, num_actions,
EPS_START, EPS_END, EPS_DECAY, steps_done):
"""
Selects whether the next action is chosen by the model or at random.
"""
sample = random.random()
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
math.exp(-1. * steps_done / EPS_DECAY)
if sample > eps_threshold:
return model(
Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
else:
return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA):
global last_sync
if len(memory) < BATCH_SIZE:
return
transitions = memory.sample(BATCH_SIZE)
# Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
# detailed explanation).
batch = Transition(*zip(*transitions))

# Compute a mask of non-final states and concatenate the batch elements
non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
batch.next_state)))

# We don't want to backprop through the expected action values and volatile
# will save us on temporarily changing the model parameters'
# requires_grad to False!
non_final_next_states = Variable(torch.cat([s for s in batch.next_state
if s is not None]),
volatile=True)
state_batch = Variable(torch.cat(batch.state))
action_batch = Variable(torch.cat(batch.action))
reward_batch = Variable(torch.cat(batch.reward))

# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken
state_action_values = model(state_batch).gather(1, action_batch)

# Compute V(s_{t+1}) for all next states.
next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]
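# Standard DQN backup: V(s') = max_a Q(s', a); compare with the soft log-sum-exp backup used in code/distral/distral.py.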
# Now, we don't want to mess up the loss with a volatile flag, so let's
# clear it. After this, we'll just end up with a Variable that has
# requires_grad=False
next_state_values.volatile = False
# Compute the expected Q values
expected_state_action_values = (next_state_values * GAMMA) + reward_batch

# Compute Huber loss
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

# Optimize the model
optimizer.zero_grad()
loss.backward()
for param in model.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
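
A minimal sketch of how this vector-input network and the epsilon-greedy selection fit together; the observation size, action count, and zero-valued state are illustrative:

import torch
from network import DQN, select_action

input_size, num_actions = 8, 4          # e.g. a flattened gridworld observation
model = DQN(input_size, num_actions)

state = torch.zeros(1, input_size)      # batch of one vector observation
action = select_action(state, model, num_actions,
                       EPS_START=0.9, EPS_END=0.05, EPS_DECAY=1000, steps_done=0)
# action is a 1x1 LongTensor holding the index of the chosen action.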