Replies: 1 comment 1 reply
-
Hello, the 4 s random termination makes me guess that you haven't recorded a custom reward function for your custom map? This is done by executing `python -m tmrl --record-reward` and driving to the finish line. Then you can run `python -m tmrl --check-environment` and drive manually to check that the printed rewards make sense.
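If that is what is happening, the timing would be consistent with the failure-countdown parameters in the config you posted below. A rough sketch of the arithmetic (this is my reading of those parameters, not a quote of the tmrl internals):

```python
# Back-of-the-envelope check (assumption: an episode in which the reward
# function never registers progress is cut off after roughly
# MIN_STEPS + FAILURE_COUNTDOWN environment steps).
MIN_STEPS = 70             # REWARD_CONFIG["MIN_STEPS"] in the posted config
FAILURE_COUNTDOWN = 10     # REWARD_CONFIG["FAILURE_COUNTDOWN"]
TIME_STEP_DURATION = 0.05  # RTGYM_CONFIG["time_step_duration"], in seconds

cutoff = (MIN_STEPS + FAILURE_COUNTDOWN) * TIME_STEP_DURATION
print(f"earliest failure cutoff: about {cutoff} s")  # 4.0 s, matching the reported terminations
```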
-
I am trying to use tmrl as a Gym environment and to train my model on a custom map, but after about 4 seconds the environment terminates itself. This is the code I have been using:
""import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tmrl import get_environment
from collections import deque
import random
import time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
class DQN(nn.Module):
def init(self, input_dim, output_dim):
super(DQN, self).init()
self.fc1 = nn.Linear(input_dim, 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, output_dim)
class DQNAgent:
def init(self, state_shape, action_space):
self.state_shape = state_shape
self.action_space = action_space
self.memory = deque(maxlen=10000)
self.gamma = 0.95
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = 0.995
self.model = DQN(state_shape[0], action_space).to(device)
self.target_model = DQN(state_shape[0], action_space).to(device)
self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
self.update_target_model()
def preprocess_obs(obs):
speed = obs[0]
lidar = obs[1].flatten()
prev_actions = obs[2]
return np.concatenate([speed, lidar, prev_actions])
def action_to_env_action(action):
if action == 0:
return np.array([1.0, 0.0, -1.0]) # Full throttle, turn left
elif action == 1:
return np.array([1.0, 0.0, 0.0]) # Full throttle, go straight
else:
return np.array([1.0, 0.0, 1.0]) # Full throttle, turn right
def calculate_speed_reward(speed):
return speed * 0.1 # Scale the reward to be smaller than the main rewards
env = get_environment()
state_shape = (1 + 4*19 + 3,) # speed + flattened LIDAR + previous actions
action_space = 3 # Left, Straight, Right
agent = DQNAgent(state_shape, action_space)
batch_size = 32
n_episodes = 1000
time_limit = 300 # 30 seconds time limit
for e in range(n_episodes):
obs, info = env.reset()
state = preprocess_obs(obs)
total_reward = 0
done = False
start_time = time.time()
step_count = 0
episode_speeds = []
torch.save(agent.model.state_dict(), 'tmrl_dqn_model.pth')
print("Training finished.")""
And here is the config file I used. I am not able to figure out why the environment terminates itself after a fixed time:
""{
"RUN_NAME": "test",
"RESET_TRAINING": true,
"BUFFERS_MAXLEN": 5000000,
"RW_MAX_SAMPLES_PER_EPISODE": 10000,
"CUDA_TRAINING": true,
"CUDA_INFERENCE": false,
"VIRTUAL_GAMEPAD": true,
"DCAC": false,
"LOCALHOST_WORKER": true,
"LOCALHOST_TRAINER": true,
"PUBLIC_IP_SERVER": "0.0.0.0",
"PASSWORD": "==>TMRL@UseASecurePasswordHere!<==",
"TLS": false,
"TLS_HOSTNAME": "default",
"TLS_CREDENTIALS_DIRECTORY": "",
"NB_WORKERS": -1,
"WANDB_PROJECT": "tmrl",
"WANDB_ENTITY": "tmrl",
"WANDB_KEY": "df28d4daa98d2df2557d74caf78e40c68adaf288",
"PORT": 55555,
"LOCAL_PORT_SERVER": 55556,
"LOCAL_PORT_TRAINER": 55557,
"LOCAL_PORT_WORKER": 55558,
"BUFFER_SIZE": 536870912,
"HEADER_SIZE": 12,
"SOCKET_TIMEOUT_CONNECT_TRAINER": 300.0,
"SOCKET_TIMEOUT_ACCEPT_TRAINER": 300.0,
"SOCKET_TIMEOUT_CONNECT_ROLLOUT": 300.0,
"SOCKET_TIMEOUT_ACCEPT_ROLLOUT": 300.0,
"SOCKET_TIMEOUT_COMMUNICATE": 30.0,
"SELECT_TIMEOUT_OUTBOUND": 30.0,
"ACK_TIMEOUT_WORKER_TO_SERVER": 300.0,
"ACK_TIMEOUT_TRAINER_TO_SERVER": 300.0,
"ACK_TIMEOUT_SERVER_TO_WORKER": 300.0,
"ACK_TIMEOUT_SERVER_TO_TRAINER": 7200.0,
"RECV_TIMEOUT_TRAINER_FROM_SERVER": 7200.0,
"RECV_TIMEOUT_WORKER_FROM_SERVER": 600.0,
"WAIT_BEFORE_RECONNECTION": 10.0,
"LOOP_SLEEP_TIME": 1.0,
"MAX_EPOCHS": 10000,
"ROUNDS_PER_EPOCH": 100,
"TRAINING_STEPS_PER_ROUND": 200,
"MAX_TRAINING_STEPS_PER_ENVIRONMENT_STEP": 4.0,
"ENVIRONMENT_STEPS_BEFORE_TRAINING": 1000,
"UPDATE_MODEL_INTERVAL": 200,
"UPDATE_BUFFER_INTERVAL": 200,
"SAVE_MODEL_EVERY": 0,
"MEMORY_SIZE": 1000000,
"BATCH_SIZE": 256,
"ALG": {
"ALGORITHM": "SAC",
"LEARN_ENTROPY_COEF":false,
"LR_ACTOR":0.00001,
"LR_CRITIC":0.00005,
"LR_ENTROPY":0.0003,
"GAMMA":0.995,
"POLYAK":0.995,
"TARGET_ENTROPY":-0.5,
"ALPHA":0.01,
"REDQ_N":10,
"REDQ_M":2,
"REDQ_Q_UPDATES_PER_POLICY_UPDATE":20,
"OPTIMIZER_ACTOR": "adam",
"OPTIMIZER_CRITIC": "adam",
"BETAS_ACTOR": [0.997, 0.997],
"BETAS_CRITIC": [0.997, 0.997],
"L2_ACTOR": 0.0,
"L2_CRITIC": 0.0
},
"ENV": {
"RTGYM_INTERFACE": "TM20LIDAR",
"WINDOW_WIDTH": 958,
"WINDOW_HEIGHT": 488,
"SLEEP_TIME_AT_RESET": 1.5,
"IMG_HIST_LEN": 4,
"RTGYM_CONFIG": {
"time_step_duration": 0.05,
"start_obs_capture": 0.04,
"time_step_timeout_factor": 10.0,
"act_buf_len": 2,
"benchmark": false,
"wait_on_done": true,
"ep_max_length": 1000
},
"REWARD_CONFIG": {
"END_OF_TRACK": 100.0,
"CONSTANT_PENALTY": 0.0,
"CHECK_FORWARD": 500,
"CHECK_BACKWARD": 10,
"FAILURE_COUNTDOWN": 10,
"MIN_STEPS": 70,
"MAX_STRAY": 100.0
}
},
"VERSION": "0.6.0"
}""