feat(tokenizer): new logic

AdityaNG · Feb 29, 2024 · f08c6b0
1 parent 7c113ef
commit f08c6b0
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 23 deletions.
diff --git a/drivellava/model.py b/drivellava/model.py
@@ -113,12 +113,16 @@ def run(self, query: str, image_files: List[str]):
             else:
                 qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
 
+        print("qs", qs)
+
         # Prepare conversation
         conv = conv_templates[self.conv_mode].copy()
         conv.append_message(conv.roles[0], qs)
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
 
+        print("prompt", prompt)
+
         # Process images
         images = load_images(image_files)
         image_sizes = [x.size for x in images]
@@ -135,21 +139,25 @@ def run(self, query: str, image_files: List[str]):
             .to(self.model.device)
         )
 
+        print("input_ids", input_ids)
+
         # Inference
         with torch.inference_mode():
             output_ids = self.model.generate(
                 input_ids,
                 images=images_tensor,
                 image_sizes=image_sizes,
                 do_sample=True if self.args.temperature > 0 else False,
-                temperature=self.args.temperature,
-                top_p=self.args.top_p,
+                # temperature=self.args.temperature,
+                # top_p=self.args.top_p,
                 num_beams=self.args.num_beams,
                 max_new_tokens=self.args.max_new_tokens,
                 use_cache=True,
             )
 
-        outputs = self.tokenizer.batch_decode(output_ids)
+        outputs = self.tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True
+        )
         print("outputs", outputs)
 
         outputs = outputs[0]

diff --git a/drivellava/scripts/eval.py b/drivellava/scripts/eval.py
@@ -30,11 +30,10 @@ def main():
 
     # from transformers.models.llava.configuration_llava import LlavaConfig
 
-    # fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
-    fine_tuned_model_path = os.path.expanduser(
-        "~/Datasets/checkpoints/checkpoint-4000/"
-        # '~/Datasets/checkpoints/checkpoint-4000/drivellava.bin'
-    )
+    fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
+    # fine_tuned_model_path = os.path.expanduser(
+    #     "~/Datasets/checkpoints/checkpoint-1000/"
+    # )
 
     args = type(
         "Args",
@@ -50,30 +49,36 @@ def main():
             "temperature": 0,
             "top_p": None,
             "num_beams": 1,
-            "max_new_tokens": 4,
+            "max_new_tokens": 64,
         },
     )()
 
     model = DriveLLaVA(args)
 
     print(dir(model.tokenizer))
-    print(model.tokenizer.get_vocab())
+    # print(model.tokenizer.get_vocab())
 
     NUM_FRAMES = 20 * 1
 
-    encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy"  # noqa
+    # encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy"  # noqa
+    encoded_video_path = "/root/Datasets/commavq/data_0_to_2500/000e83c564317de4668c2cb372f89b91_6.npy"  # noqa
+    # encoded_video_path = "/root/Datasets/commavq/img_data_0_to_2500/000e83c564317de4668c2cb372f89b91_6.npy"  # noqa
 
     # assert os.path.isfile(encoded_video_path), encoded_video_path
 
     pose_path = encoded_video_path.replace("data_", "pose_data_").replace(
-        "val", "pose_val"
+        # pose_path = encoded_video_path.replace("img_data_", "pose_data_").replace(
+        "val",
+        "pose_val",
     )
     assert os.path.isfile(pose_path), pose_path
 
     decoded_imgs_list = []
 
     for frame_index in range(1200):
         frame_path = get_image_path(encoded_video_path, frame_index)
+        frame_path = frame_path.replace("data_", "img_data_")
+        # print('frame_path', frame_path)
         if os.path.isfile(frame_path):
             decoded_imgs_list.append(frame_path)
 
@@ -108,8 +113,19 @@ def main():
         trajectory, trajectory_encoded = pose_dataset[i]
         trajectory_quantized = trajectory_encoder.decode(trajectory_encoded)
 
+        traj_tokens = model.tokenizer.tokenize(trajectory_encoded)
+        traj_tokens_encoded = model.tokenizer.encode(trajectory_encoded)
+        print(
+            "traj_tokens",
+            trajectory_encoded,
+            "->",
+            traj_tokens,
+            "->",
+            traj_tokens_encoded,
+        )
+
         model_trajectory_quantized = model.run(
-            get_drivellava_prompt(trajectory_encoder),
+            get_drivellava_prompt(trajectory_encoder, default_image_token=""),
             [
                 decoded_imgs_list[i],
             ],

diff --git a/drivellava/scripts/train.py b/drivellava/scripts/train.py
@@ -25,6 +25,11 @@ def load_json_dataset(
             loaded = json.load(f)
             for index in range(len(loaded)):
                 assert len(loaded[index]["conversations"][1]["value"]) == 1
+
+                loaded[index]["conversations"][1]["value"] = (
+                    "Selected Trajectory: "
+                    + loaded[index]["conversations"][1]["value"]
+                )
                 loaded[index]["conversations"][0]["value"] = (
                     get_drivellava_prompt(trajectory_encoder)
                 )

diff --git a/drivellava/sparse_llava_dataset.py b/drivellava/sparse_llava_dataset.py
@@ -113,26 +113,29 @@ def visualize_pose(
         exit()
 
 
-def get_drivellava_prompt(trajectory_encoder: TrajectoryEncoder):
+def get_drivellava_prompt(
+    trajectory_encoder: TrajectoryEncoder,
+    default_image_token: str = DEFAULT_IMAGE_TOKEN,
+):
     traj_list = list(trajectory_encoder.token2trajectory.keys())
     random.shuffle(traj_list)
     traj_str = ",".join(list(map(str, traj_list)))
     P1 = (
-        f"{DEFAULT_IMAGE_TOKEN}\nYou are DriveLLaVA, a "
+        f"{default_image_token}\nYou are DriveLLaVA, a "
         + "self-driving car. You will select the "
         + "appropriate trrajectory token given the "
         + "above image as context.\n"
         + "You may select one from the "
         + f"following templates: {traj_str}"
     )
-    P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""  # noqa
-    P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""  # noqa
-    P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""  # noqa
-    P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""  # noqa
-    P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""  # noqa
-    P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""  # noqa
-    P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""  # noqa
-    P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""  # noqa
+    P2 = f"""{default_image_token} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""  # noqa
+    P3 = f"""{default_image_token} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""  # noqa
+    P4 = f"""{default_image_token} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""  # noqa
+    P5 = f"""{default_image_token} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""  # noqa
+    P6 = f"""{default_image_token} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""  # noqa
+    P7 = f"""{default_image_token} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""  # noqa
+    P8 = f"""{default_image_token} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""  # noqa
+    P9 = f"""{default_image_token} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""  # noqa
 
     return random.choice([P1, P2, P3, P4, P5, P6, P7, P8, P9])