From f08c6b0034aba23727dcf407d3441424a06eb0d4 Mon Sep 17 00:00:00 2001
From: Aditya NG <adityang5@gmail.com>
Date: Thu, 29 Feb 2024 10:47:58 +0000
Subject: [PATCH] feat(tokenizer): debug tokenization, prefix answers with "Selected Trajectory: ", make prompt image token configurable

---
 drivellava/model.py                | 14 +++++++++---
 drivellava/scripts/eval.py         | 36 +++++++++++++++++++++---------
 drivellava/scripts/train.py        |  5 +++++
 drivellava/sparse_llava_dataset.py | 23 ++++++++++---------
 4 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/drivellava/model.py b/drivellava/model.py
index 26f8f59..fe38cb1 100644
--- a/drivellava/model.py
+++ b/drivellava/model.py
@@ -113,12 +113,16 @@ def run(self, query: str, image_files: List[str]):
             else:
                 qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
+        print("qs", qs)
+
         # Prepare conversation
         conv = conv_templates[self.conv_mode].copy()
         conv.append_message(conv.roles[0], qs)
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
 
+        print("prompt", prompt)
+
         # Process images
         images = load_images(image_files)
         image_sizes = [x.size for x in images]
@@ -135,6 +139,8 @@ def run(self, query: str, image_files: List[str]):
             .to(self.model.device)
         )
 
+        print("input_ids", input_ids)
+
         # Inference
         with torch.inference_mode():
             output_ids = self.model.generate(
@@ -142,14 +148,16 @@ def run(self, query: str, image_files: List[str]):
                 images=images_tensor,
                 image_sizes=image_sizes,
                 do_sample=True if self.args.temperature > 0 else False,
-                temperature=self.args.temperature,
-                top_p=self.args.top_p,
+                # temperature=self.args.temperature,
+                # top_p=self.args.top_p,
                 num_beams=self.args.num_beams,
                 max_new_tokens=self.args.max_new_tokens,
                 use_cache=True,
             )
 
-        outputs = self.tokenizer.batch_decode(output_ids)
+        outputs = self.tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True
+        )
         print("outputs", outputs)
 
         outputs = outputs[0]
diff --git a/drivellava/scripts/eval.py b/drivellava/scripts/eval.py
index 7ef02f5..125734b 100644
--- a/drivellava/scripts/eval.py
+++ b/drivellava/scripts/eval.py
@@ -30,11 +30,10 @@ def main():
 
     # from transformers.models.llava.configuration_llava import LlavaConfig
 
-    # fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
-    fine_tuned_model_path = os.path.expanduser(
-        "~/Datasets/checkpoints/checkpoint-4000/"
-        # '~/Datasets/checkpoints/checkpoint-4000/drivellava.bin'
-    )
+    fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
+    # fine_tuned_model_path = os.path.expanduser(
+    #     "~/Datasets/checkpoints/checkpoint-1000/"
+    # )
 
     args = type(
         "Args",
@@ -50,23 +49,27 @@ def main():
             "temperature": 0,
             "top_p": None,
             "num_beams": 1,
-            "max_new_tokens": 4,
+            "max_new_tokens": 64,
         },
     )()
 
     model = DriveLLaVA(args)
 
     print(dir(model.tokenizer))
-    print(model.tokenizer.get_vocab())
+    # print(model.tokenizer.get_vocab())
 
     NUM_FRAMES = 20 * 1
 
-    encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy"  # noqa
+    # encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy"  # noqa
+    encoded_video_path = "/root/Datasets/commavq/data_0_to_2500/000e83c564317de4668c2cb372f89b91_6.npy"  # noqa
+    # encoded_video_path = "/root/Datasets/commavq/img_data_0_to_2500/000e83c564317de4668c2cb372f89b91_6.npy"  # noqa
 
     # assert os.path.isfile(encoded_video_path), encoded_video_path
 
     pose_path = encoded_video_path.replace("data_", "pose_data_").replace(
-        "val", "pose_val"
+        # pose_path = encoded_video_path.replace("img_data_", "pose_data_").replace(
+        "val",
+        "pose_val",
     )
     assert os.path.isfile(pose_path), pose_path
 
@@ -74,6 +77,8 @@ def main():
 
     for frame_index in range(1200):
         frame_path = get_image_path(encoded_video_path, frame_index)
+        frame_path = frame_path.replace("data_", "img_data_")
+        # print('frame_path', frame_path)
         if os.path.isfile(frame_path):
             decoded_imgs_list.append(frame_path)
 
@@ -108,8 +113,19 @@ def main():
         trajectory, trajectory_encoded = pose_dataset[i]
         trajectory_quantized = trajectory_encoder.decode(trajectory_encoded)
 
+        traj_tokens = model.tokenizer.tokenize(trajectory_encoded)
+        traj_tokens_encoded = model.tokenizer.encode(trajectory_encoded)
+        print(
+            "traj_tokens",
+            trajectory_encoded,
+            "->",
+            traj_tokens,
+            "->",
+            traj_tokens_encoded,
+        )
+
         model_trajectory_quantized = model.run(
-            get_drivellava_prompt(trajectory_encoder),
+            get_drivellava_prompt(trajectory_encoder, default_image_token=""),
             [
                 decoded_imgs_list[i],
             ],
diff --git a/drivellava/scripts/train.py b/drivellava/scripts/train.py
index 21664ac..d4ace88 100644
--- a/drivellava/scripts/train.py
+++ b/drivellava/scripts/train.py
@@ -25,6 +25,11 @@ def load_json_dataset(
             loaded = json.load(f)
             for index in range(len(loaded)):
                 assert len(loaded[index]["conversations"][1]["value"]) == 1
+
+                loaded[index]["conversations"][1]["value"] = (
+                    "Selected Trajectory: "
+                    + loaded[index]["conversations"][1]["value"]
+                )
                 loaded[index]["conversations"][0]["value"] = (
                     get_drivellava_prompt(trajectory_encoder)
                 )
diff --git a/drivellava/sparse_llava_dataset.py b/drivellava/sparse_llava_dataset.py
index 15b1a2e..32a4824 100644
--- a/drivellava/sparse_llava_dataset.py
+++ b/drivellava/sparse_llava_dataset.py
@@ -113,26 +113,29 @@ def visualize_pose(
         exit()
 
 
-def get_drivellava_prompt(trajectory_encoder: TrajectoryEncoder):
+def get_drivellava_prompt(
+    trajectory_encoder: TrajectoryEncoder,
+    default_image_token: str = DEFAULT_IMAGE_TOKEN,
+):
     traj_list = list(trajectory_encoder.token2trajectory.keys())
     random.shuffle(traj_list)
     traj_str = ",".join(list(map(str, traj_list)))
     P1 = (
-        f"{DEFAULT_IMAGE_TOKEN}\nYou are DriveLLaVA, a "
+        f"{default_image_token}\nYou are DriveLLaVA, a "
         + "self-driving car. You will select the "
         + "appropriate trrajectory token given the "
         + "above image as context.\n"
         + "You may select one from the "
         + f"following templates: {traj_str}"
     )
-    P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""  # noqa
-    P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""  # noqa
-    P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""  # noqa
-    P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""  # noqa
-    P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""  # noqa
-    P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""  # noqa
-    P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""  # noqa
-    P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""  # noqa
+    P2 = f"""{default_image_token} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""  # noqa
+    P3 = f"""{default_image_token} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""  # noqa
+    P4 = f"""{default_image_token} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""  # noqa
+    P5 = f"""{default_image_token} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""  # noqa
+    P6 = f"""{default_image_token} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""  # noqa
+    P7 = f"""{default_image_token} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""  # noqa
+    P8 = f"""{default_image_token} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""  # noqa
+    P9 = f"""{default_image_token} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""  # noqa
 
     return random.choice([P1, P2, P3, P4, P5, P6, P7, P8, P9])