Commit
Added SQN0, DQL0 for vector input. To do: finish distral, tune hyperparams, play_game at utils.
Showing 27 changed files with 1,574 additions and 783 deletions.
@@ -0,0 +1,119 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor


class DQN(nn.Module):
    """
    Deep neural network that represents an agent.
    """
    def __init__(self, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(1, 5, kernel_size=2)
        self.bn1 = nn.BatchNorm2d(5)
        self.conv2 = nn.Conv2d(5, 10, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(10)
        self.conv3 = nn.Conv2d(10, 10, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(10)
        self.head = nn.Linear(200, num_actions)

    def forward(self, x):
        x = F.leaky_relu(self.bn1(self.conv1(x)))
        x = F.leaky_relu(self.bn2(self.conv2(x)))
        x = F.leaky_relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))


# Alternative (currently unused) architecture, kept for experiments:
# class DQN(nn.Module):
#     """
#     Deep neural network that represents an agent.
#     """
#     def __init__(self, num_actions):
#         super(DQN, self).__init__()
#         self.conv1 = nn.Conv2d(1, 10, kernel_size=2)
#         self.max_pool = nn.MaxPool2d((2, 2))
#         self.bn1 = nn.BatchNorm2d(10)
#         self.conv2 = nn.Conv2d(10, 20, kernel_size=3)
#         self.bn2 = nn.BatchNorm2d(20)
#         self.linear = nn.Linear(80, 20)
#         # self.bn3 = nn.BatchNorm1d(50)
#         self.head = nn.Linear(20, num_actions)

#     def forward(self, x):
#         x = F.leaky_relu(self.max_pool(self.bn1(self.conv1(x))))
#         x = F.leaky_relu(self.bn2(self.conv2(x)))
#         x = F.leaky_relu(self.linear(x.view(x.size(0), -1)))
#         return self.head(x)

def select_action(state, model, num_actions,
                  EPS_START, EPS_END, EPS_DECAY, steps_done):
    """
    Selects whether the next action is chosen by our model or randomly.
    """
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        return model(
            Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA, BETA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))
    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute the soft value V(s_{t+1}) = (1 / BETA) * log sum_a exp(BETA * Q(s_{t+1}, a))
    # for all non-final next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = torch.log(torch.exp(
        BETA * model(non_final_next_states)).sum(1)) / BETA
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute the MSE loss between predicted and expected Q values
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
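The assignment to next_state_values above implements the soft state value V(s) = (1/BETA) * log sum_a exp(BETA * Q(s, a)). For large BETA or large Q-values the explicit exp can overflow; a minimal sketch of a numerically equivalent, more stable variant using the standard max-subtraction trick is shown below (the helper name soft_value is illustrative and not part of this commit):

def soft_value(q_values, beta):
    # q_values: tensor of shape (batch, num_actions); returns shape (batch,)
    scaled = beta * q_values
    max_q, _ = scaled.max(1, keepdim=True)      # factor out the per-row maximum
    # log-sum-exp with the maximum subtracted, so exp() never sees large arguments
    lse = max_q.squeeze(1) + torch.log(torch.exp(scaled - max_q).sum(1))
    return lse / beta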
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
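A short usage sketch of the buffer above (the state, action, next_state, and reward tensors here are placeholders, not values from this commit; in train_sql.py they come from get_screen and select_action):

memory = ReplayMemory(10000)
memory.push(state, action, next_state, reward)      # store one transition
if len(memory) >= 128:
    transitions = memory.sample(128)                # list of 128 Transition namedtuples
    batch = Transition(*zip(*transitions))          # transpose into batched fields, as in optimize_model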
@@ -0,0 +1,105 @@
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
import torch.optim as optim
import torch
import math
import numpy as np
from memory_replay import ReplayMemory, Transition
from network import DQN, select_action, optimize_model, Tensor
import sys
sys.path.append('../')
from envs.gridworld_env import GridworldEnv
from utils import plot_rewards, plot_durations, plot_state, get_screen


def trainSQL(file_name="SQL", env=GridworldEnv(1), batch_size=128,
             gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
             is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
             learning_rate=0.001, memory_replay_size=10000):
    """
    Soft Q-learning training routine. Returns the trained model together with
    the reward and duration logs and, if is_plot is set, plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        print("Cur episode:", i_episode, "steps done:", t,
              "exploration factor:", eps_end + (eps_start - eps_end) * \
              math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions,
                                   eps_start, eps_end, eps_decay, steps_done)
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    # Store results
    np.save(file_name + '-sql-rewards', episode_rewards)
    np.save(file_name + '-sql-durations', episode_durations)

    return model, episode_rewards, episode_durations
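A minimal way to call the routine above might look as follows (the argument values are illustrative assumptions, not taken from this commit):

if __name__ == '__main__':
    # train soft Q-learning on the first gridworld map and save the logs
    model, rewards, durations = trainSQL(file_name="SQL-grid1",
                                         env=GridworldEnv(1),
                                         num_episodes=300,
                                         is_plot=False)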
@@ -0,0 +1,26 @@
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
@@ -0,0 +1,90 @@
import math
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from memory_replay import Transition

use_cuda = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

class DQN(nn.Module):
    """
    Deep neural network that represents an agent.
    """
    def __init__(self, input_size, num_actions):
        super(DQN, self).__init__()
        self.l1 = nn.Linear(input_size, 100)  # to play with different NN architectures
        self.l2 = nn.Linear(100, num_actions)

    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))


def select_action(state, model, num_actions,
                  EPS_START, EPS_END, EPS_DECAY, steps_done):
    """
    Selects whether the next action is chosen by our model or randomly.
    """
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    if sample > eps_threshold:
        return model(
            Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(num_actions)]])


def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))

    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) = max_a Q(s_{t+1}, a) for all non-final next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = model(non_final_next_states).max(1)[0]
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
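For reference, select_action in both network files anneals the exploration rate with the same exponential schedule. A small standalone check of the threshold, assuming the defaults eps_start=0.9, eps_end=0.05, eps_decay=1000 from trainSQL:

import math

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 1000
for steps_done in (0, 500, 1000, 5000):
    # same formula used inside select_action
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    print(steps_done, round(eps, 3))   # 0.9, 0.566, 0.363, 0.056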