diff --git a/multi_turn_reward_for_RLHF/main.py b/multi_turn_reward_for_RLHF/main.py
new file mode 100644
index 00000000000..9c9f736eee3
--- /dev/null
+++ b/multi_turn_reward_for_RLHF/main.py
@@ -0,0 +1,132 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+
+
+class DialogueEnv:
+    """Multi-turn dialogue environment that simulates conversations."""
+
+    def __init__(self):
+        self.turns = 5  # Each dialogue lasts 5 turns
+        self.current_turn = 0
+        self.conversation = []
+
+    def reset(self):
+        """Resets the environment for a new dialogue."""
+        self.current_turn = 0
+        self.conversation = []
+        return "Hi, how can I help you today?"  # Starting dialogue
+
+    def step(self, action):
+        """Takes an action (a response) and advances the conversation."""
+        self.conversation.append(action)
+        self.current_turn += 1
+
+        if self.current_turn < self.turns:
+            # Generate the next response from the environment (placeholder)
+            next_state = f"Response {self.current_turn}: How about this?"
+            done = False
+            reward = self._human_feedback(action)
+        else:
+            next_state = "Conversation ended."
+            done = True
+            reward = self._human_feedback(action)
+
+        return next_state, reward, done
+
+    def _human_feedback(self, action):
+        """Simulates human feedback by returning a random reward."""
+        return np.random.choice([1, -1])  # 1 for positive feedback, -1 for negative
+
+
+class PolicyNetwork(nn.Module):
+    """Policy network that defines the agent's behavior."""
+
+    def __init__(self, input_size=100, hidden_size=128, output_size=10):
+        super(PolicyNetwork, self).__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x):
+        """Forward pass through the network."""
+        x = torch.relu(self.fc1(x))
+        return self.fc2(x)
+
+
+def pad_or_truncate(state, size=100):
+    """Pads or truncates the input state to match the required input size."""
+    state_tensor = torch.tensor([ord(c) for c in state], dtype=torch.float32)
+    if state_tensor.size(0) < size:
+        padded_tensor = torch.cat([state_tensor, torch.zeros(size - state_tensor.size(0))])
+    else:
+        padded_tensor = state_tensor[:size]
+    return padded_tensor.unsqueeze(0)  # Add batch dimension
+
+
+def train_rlhf(env, model, optimizer, num_episodes=1000):
+    """Trains the policy network using reinforcement learning with human feedback."""
+    gamma = 0.99  # Discount factor for future rewards
+
+    for episode in range(num_episodes):
+        state = env.reset()
+        total_reward = 0
+        log_probs = []
+        rewards = []
+
+        done = False
+        while not done:
+            # Pad or truncate the input state to the required size
+            state_tensor = pad_or_truncate(state, size=100)
+            logits = model(state_tensor)
+            action_probs = torch.softmax(logits, dim=-1)
+            action_dist = torch.distributions.Categorical(action_probs)
+
+            action = action_dist.sample()
+            log_prob = action_dist.log_prob(action)
+            log_probs.append(log_prob)
+
+            # Take the action in the environment
+            action_text = f"Action {action.item()}"
+            next_state, reward, done = env.step(action_text)
+            rewards.append(reward)
+            total_reward += reward
+
+            state = next_state
+
+        # Calculate the discounted rewards
+        discounted_rewards = []
+        cumulative_reward = 0
+        for r in reversed(rewards):
+            cumulative_reward = r + gamma * cumulative_reward
+            discounted_rewards.insert(0, cumulative_reward)
+
+        # Normalize the rewards
+        discounted_rewards = torch.tensor(discounted_rewards)
+        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-6)
+
+        # Policy Gradient: Update the policy
+        policy_loss = []
+        for log_prob, reward in zip(log_probs, discounted_rewards):
+            policy_loss.append(-log_prob * reward)
+
+        optimizer.zero_grad()
+        policy_loss = torch.cat(policy_loss).sum()
+        policy_loss.backward()
+        optimizer.step()
+
+        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")
+
+
+if __name__ == "__main__":
+    # Instantiate environment and model
+    env = DialogueEnv()
+    input_size = 100  # Placeholder for state size (e.g., fixed-length input of size 100)
+    hidden_size = 128
+    output_size = 10  # Placeholder for the number of possible actions (dialogue responses)
+
+    model = PolicyNetwork(input_size, hidden_size, output_size)
+    optimizer = optim.Adam(model.parameters(), lr=1e-3)
+
+    # Train the policy using RL with Human Feedback (simulated)
+    train_rlhf(env, model, optimizer, num_episodes=1000)
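
A minimal smoke-test sketch for the module added above, assuming it is saved at multi_turn_reward_for_RLHF/main.py and importable from the repository root (the import path, seeds, and small episode count are illustrative assumptions, not taken from the patch):

# Quick end-to-end check of DialogueEnv + PolicyNetwork + train_rlhf.
# Assumes multi_turn_reward_for_RLHF is importable (hypothetical path for illustration).
import numpy as np
import torch
import torch.optim as optim

from multi_turn_reward_for_RLHF.main import DialogueEnv, PolicyNetwork, train_rlhf

# Fix seeds so the simulated +/-1 feedback is reproducible across runs.
torch.manual_seed(0)
np.random.seed(0)

env = DialogueEnv()
model = PolicyNetwork(input_size=100, hidden_size=128, output_size=10)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# A handful of episodes is enough to confirm the loop runs; with random feedback
# the printed total reward is not expected to trend upward.
train_rlhf(env, model, optimizer, num_episodes=10)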