fixed bugs & distral is working
koloskova committed Mar 17, 2018
1 parent eb2c98d commit db68e9e
Showing 3 changed files with 53 additions and 25 deletions.
14 changes: 13 additions & 1 deletion code/distral_2col/memory_replay.py

@@ -6,20 +6,32 @@

 class ReplayMemory(object):

-    def __init__(self, capacity):
+    def __init__(self, capacity, policy_capacity):
         self.capacity = capacity
         self.memory = []
         self.position = 0
+
+        self.policy_capacity = policy_capacity
+        self.policy_memory = []
+        self.policy_position = 0

     def push(self, *args):
         """Saves a transition."""
         if len(self.memory) < self.capacity:
             self.memory.append(None)
         self.memory[self.position] = Transition(*args)
         self.position = (self.position + 1) % self.capacity
+
+        if len(self.policy_memory) < self.policy_capacity:
+            self.policy_memory.append(None)
+        self.policy_memory[self.policy_position] = Transition(*args)
+        self.policy_position = (self.policy_position + 1) % self.policy_capacity

     def sample(self, batch_size):
         return random.sample(self.memory, batch_size)
+
+    def policy_sample(self, batch_size):
+        return random.sample(self.policy_memory, batch_size)

     def __len__(self):
         return len(self.memory)
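
Usage note: the extended ReplayMemory keeps two ring buffers with independent capacities, a large one (memory) that feeds the per-task Q-learning updates via sample(), and a smaller one (policy_memory) that feeds distillation of the shared policy via policy_sample(). A minimal sketch, assuming illustrative capacities and placeholder transition fields (the real push() arguments must match this repo's Transition namedtuple):

from memory_replay import ReplayMemory

# Large buffer for per-task Q-learning, small buffer for policy distillation.
memory = ReplayMemory(capacity=10000, policy_capacity=1000)

# Every push() writes the same Transition into both buffers; once a buffer
# is full, the oldest slot is overwritten (the write position wraps modulo
# that buffer's capacity).
# memory.push(state, action, next_state, reward)

# Q-learning batches come from the large buffer, distillation batches from
# the small one:
# batch = memory.sample(128)
# policy_batch = memory.policy_sample(128)
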
45 changes: 30 additions & 15 deletions code/distral_2col/network.py

@@ -8,6 +8,8 @@
 from torch.autograd import Variable
 from memory_replay import Transition
 from itertools import count
+from torch.distributions import Categorical
+

 use_cuda = torch.cuda.is_available()

@@ -41,7 +43,7 @@ class PolicyNetwork(nn.Module):
     Deep neural network which represents policy network.
     """
     def __init__(self, num_actions):
-        super(DQN, self).__init__()
+        super(PolicyNetwork, self).__init__()
         self.conv1 = nn.Conv2d(1, 5, kernel_size=2)
         self.bn1 = nn.BatchNorm2d(5)
         self.conv2 = nn.Conv2d(5, 10, kernel_size=3)

@@ -52,25 +54,38 @@ def __init__(self, num_actions):
         self.softmax = nn.Softmax()

     def forward(self, x):
-        x = F.leaky_relu(self.max_pool(self.bn1(self.conv1(x))))
+        x = F.leaky_relu(self.bn1(self.conv1(x)))
         x = F.leaky_relu(self.bn2(self.conv2(x)))
-        x = F.leaky_relu(self.linear(x.view(x.size(0), -1)))
-        return self.softmax(self.head(x))
+        x = F.leaky_relu(self.bn3(self.conv3(x)))
+        x = F.leaky_relu(self.head(x.view(x.size(0), -1)))
+        return self.softmax(x)

-def select_action(state, model, num_actions,
-                  EPS_START, EPS_END, EPS_DECAY, steps_done):
+def select_action(state, policy, model, num_actions,
+                  EPS_START, EPS_END, EPS_DECAY, steps_done, alpha, beta):
     """
     Selects whether the next action is choosen by our model or randomly
     """
-    sample = random.random()
-    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
-        math.exp(-1. * steps_done / EPS_DECAY)
-    if sample > eps_threshold:
-        return model(
-            Variable(state, volatile=True).type(FloatTensor
-            )).data.max(1)[1].view(1, 1)
-    else:
-        return LongTensor([[random.randrange(num_actions)]])
+    # sample = random.random()
+    # eps_threshold = EPS_END + (EPS_START - EPS_END) * \
+    #                     math.exp(-1. * steps_done / EPS_DECAY)
+    # .data.max(1)[1].view(1, 1)
+    # if sample <= eps_threshold:
+    #     return LongTensor([[random.randrange(num_actions)]])
+
+    Q = model(Variable(state, volatile=True).type(FloatTensor))
+    pi0 = policy(Variable(state, volatile=True).type(FloatTensor))
+    # print(pi0.data.numpy())
+    V = torch.log((torch.pow(pi0, alpha) * torch.exp(beta * Q)).sum(1)) / beta
+    pi_i = torch.pow(pi0, alpha) * torch.exp(beta * (Q - V))
+    # probabilities = pi_i.data.numpy()[0]
+    m = Categorical(pi_i)
+    action = m.sample().data.view(1, 1)
+    return action
+    # numpy.random.choice(numpy.arange(0, num_actions), p=probabilities)
+
+
+
+


 def optimize_policy(policy, optimizer, memories, batch_size,
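
For reference, a sketch of the math the rewritten select_action above appears to implement, with symbols matching the code (Q is the task model's action values, pi0 the output of the shared distilled policy, alpha and beta the Distral regularization coefficients):

\[
V_i(s) \;=\; \frac{1}{\beta} \log \sum_{a'} \pi_0(a' \mid s)^{\alpha} \, e^{\beta Q_i(s, a')},
\qquad
\pi_i(a \mid s) \;=\; \pi_0(a \mid s)^{\alpha} \, e^{\beta \left( Q_i(s, a) - V_i(s) \right)}
\]

Because V_i is exactly the log-normalizer of this reweighted Boltzmann distribution, pi_i sums to one over actions, so the code samples from Categorical(pi_i) directly instead of acting epsilon-greedily; the old exploration branch is kept only as comments.
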
19 changes: 10 additions & 9 deletions code/distral_2col/trainingDistral2col.py

@@ -6,16 +6,16 @@
 import math
 import numpy as np
 from memory_replay import ReplayMemory, Transition
-from network import DQN, select_action, optimize_model, Tensor, optimize_policy
+from network import DQN, select_action, optimize_model, Tensor, optimize_policy, PolicyNetwork
 import sys
 sys.path.append('../')
 from envs.gridworld_env import GridworldEnv
 from utils import plot_rewards, plot_durations, plot_state, get_screen

-def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
-            GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=1,
-            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=10,
-            is_plot=False, num_episodes=500,
+def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(5),
+            GridworldEnv(4)], batch_size=128, gamma=0.999, alpha=1,
+            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
+            is_plot=False, num_episodes=200,
             max_num_steps_per_episode=1000, learning_rate=0.001,
             memory_replay_size=10000, memory_policy_size=1000):
     """

@@ -24,7 +24,7 @@ def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
     """
     num_actions = list_of_envs[0].action_space.n
     num_envs = len(list_of_envs)
-    policy = DQN(num_actions)
+    policy = PolicyNetwork(num_actions)
     models = [DQN(num_actions) for _ in range(0, num_envs)]
     memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)]

@@ -67,9 +67,9 @@ def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
             current_screen = get_screen(env)
             state = current_screen # - last_screen
             # Select and perform an action
-            action = select_action(state, models[i_env], num_actions,
+            action = select_action(state, policy, models[i_env], num_actions,
                                     eps_start, eps_end, eps_decay,
-                                    episodes_done[i_env])
+                                    episodes_done[i_env], alpha, beta)
             steps_done[i_env] += 1
             current_time[i_env] += 1
             _, reward, done, _ = env.step(action[0, 0])

@@ -91,7 +91,8 @@ def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4),
                 optimize_model(policy, models[i_env], optimizers[i_env],
                                memories[i_env], batch_size, alpha, beta, gamma)
             if done:
-                print("ENV:", i_env, "\treward:", env.episode_total_reward,
+                print("ENV:", i_env, "iter:", episodes_done[i_env],
+                      "\treward:", env.episode_total_reward,
                       "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                       (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                 env.reset()
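
Finally, a hedged sketch of calling the updated trainD from inside code/distral_2col (the keyword names are the ones visible in the diff; the values are illustrative, and the defaults already construct two GridworldEnv tasks):

from trainingDistral2col import trainD

# One shared PolicyNetwork (pi0) plus one DQN column and one two-buffer
# ReplayMemory per environment; alpha and beta are forwarded to
# select_action for Distral-style action sampling.
trainD(file_name="Distral_2col",
       batch_size=128, gamma=0.999,
       alpha=1, beta=5,
       num_episodes=200, learning_rate=0.001,
       memory_replay_size=10000, memory_policy_size=1000)
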
