feat(tokenizer): revised logic
AdityaNG committed Feb 28, 2024
1 parent 69f1fad commit 7c113ef
Showing 7 changed files with 32,094 additions and 46 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -63,6 +63,15 @@ python3 -m drivellava.scripts.generate_sparse_llava_dataset
BNB_CUDA_VERSION=118 python3 -m drivellava.scripts.train
```

Merge the trained LoRA weights into the base model
```bash
cd LLaVA/
python scripts/merge_lora_weights.py \
--model-path /path/to/lora_model \
--model-base /path/to/base_model \
--save-model-path /path/to/merged_model
```
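
For instance, with the LoRA checkpoint used by `drivellava/scripts/eval.py` (all paths below are illustrative placeholders, not committed defaults):
```bash
cd LLaVA/
python scripts/merge_lora_weights.py \
    --model-path ~/Datasets/checkpoints/checkpoint-4000 \
    --model-base liuhaotian/llava-v1.5-7b \
    --save-model-path ~/Datasets/checkpoints/merged_model
```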

Set up the Docker container for training
```bash
docker compose run dev
4 changes: 4 additions & 0 deletions drivellava/constants.py
@@ -19,6 +19,8 @@ def __getitem__(self, index):
)


LLAVA_PATH = os.path.abspath("./LLaVA")

COMMAVQ_DIR = os.path.expanduser("~/Datasets/commavq")

# List of all the videos
@@ -53,6 +55,8 @@ def __getitem__(self, index):

ENCODED_JSON_ALL = ENCODED_JSON + VAL_ENCODED_JSON

VOCAB_JSON = os.path.abspath(os.path.join("media", "vocab.json"))


def get_image_path(encoded_video_path: str, index: int) -> str:
return os.path.join(
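
A minimal sketch of how the two new constants are consumed elsewhere in this commit (mirroring `drivellava/model.py` and `drivellava/trajectory_encoder.py` below):

```python
import sys

from drivellava.constants import LLAVA_PATH, VOCAB_JSON

# Make the vendored LLaVA checkout importable; model.py and
# sparse_llava_dataset.py now share this constant instead of
# recomputing the path locally.
if LLAVA_PATH not in sys.path:
    sys.path.append(LLAVA_PATH)

# Absolute path to media/vocab.json, read by TrajectoryEncoder.
print(VOCAB_JSON)
```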
29 changes: 14 additions & 15 deletions drivellava/model.py
@@ -1,4 +1,3 @@
import os
import re
import sys
from io import BytesIO
@@ -8,10 +7,7 @@
import torch
from PIL import Image


def image_parser(args):
out = args.image_file.split(args.sep)
return out
from drivellava.constants import LLAVA_PATH


def load_image(image_file):
@@ -34,26 +30,28 @@ def load_images(image_files):
class DriveLLaVA:
def __init__(self, args):

LLAVA_PATH = os.path.abspath("./LLaVA")

if LLAVA_PATH not in sys.path:
sys.path.append(LLAVA_PATH)

from llava.mm_utils import get_model_name_from_path
# from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init

# Model Initialization
# disable_torch_init() skips PyTorch's redundant default weight
# initialization to speed up model loading
disable_torch_init()

self.model_name = get_model_name_from_path(args.model_path)
# self.model_name = get_model_name_from_path(args.model_path)
self.model_name = "liuhaotian/llava-v1.5-7b"
# self.model_name = "llava_llama_2"

print("model_name", self.model_name)
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(
args.model_path,
args.model_base,
self.model_name,
load_8bit=True,
load_8bit=False,
)
)

@@ -71,6 +69,8 @@ def __init__(self, args):
else:
self.conv_mode = "llava_v0"

# self.conv_mode = "llava_llama_2"

if args.conv_mode is not None and self.conv_mode != args.conv_mode:
print(
f"[WARNING] the auto inferred conversation mode is "
@@ -120,7 +120,6 @@ def run(self, query: str, image_files: List[str]):
prompt = conv.get_prompt()

# Process images
# image_files = image_parser(self.args)
images = load_images(image_files)
image_sizes = [x.size for x in images]
images_tensor = process_images(
@@ -150,9 +149,9 @@ def run(self, query: str, image_files: List[str]):
use_cache=True,
)

outputs = self.tokenizer.batch_decode(
output_ids, skip_special_tokens=True
)[0].strip()
print(outputs)
outputs = self.tokenizer.batch_decode(output_ids)
print("outputs", outputs)

outputs = outputs[0]

return outputs
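
The decode change above is the behavioral core of this commit. A sketch of the difference, assuming the trajectory tokens occupy vocabulary ids 31500 and up (per `drivellava/trajectory_encoder.py` below) and may be registered as special tokens, in which case `skip_special_tokens=True` would strip them from the decoded text:

```python
# Sketch only; `tokenizer` and `output_ids` are as in DriveLLaVA.run().
# Old path: added/special tokens can be dropped during decoding,
# which would lose the predicted trajectory token.
lossy = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

# New path: decode verbatim and keep the first (and only) sequence,
# so the trajectory token survives for downstream matching.
outputs = tokenizer.batch_decode(output_ids)
prediction = outputs[0]
```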
59 changes: 39 additions & 20 deletions drivellava/scripts/eval.py
@@ -21,10 +21,20 @@
)
from drivellava.utils import plot_bev_trajectory, plot_steering_traj

# import sys


def main():

fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
# sys.path.append(LLAVA_PATH)

# from transformers.models.llava.configuration_llava import LlavaConfig

# fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
fine_tuned_model_path = os.path.expanduser(
"~/Datasets/checkpoints/checkpoint-4000/"
# '~/Datasets/checkpoints/checkpoint-4000/drivellava.bin'
)

args = type(
"Args",
@@ -34,18 +44,21 @@ def main():
"model_base": None,
# "model_name": get_model_name_from_path(fine_tuned_model_path),
# "query": prompt,
"conv_mode": None,
"conv_mode": "llava_llama_2",
# "image_file": image_file,
# "sep": ",",
"temperature": 0,
"top_p": None,
"num_beams": 1,
"max_new_tokens": 512,
"max_new_tokens": 4,
},
)()

model = DriveLLaVA(args)

print(dir(model.tokenizer))
print(model.tokenizer.get_vocab())

NUM_FRAMES = 20 * 1

encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy" # noqa
@@ -79,6 +92,10 @@ def main():
trajectory_encoder=trajectory_encoder,
)

# Save to video
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = None

# Iterate over the embeddings in batches and decode the images
for i in tqdm(
range(0, len(decoded_imgs_list) - NUM_FRAMES, 1),
@@ -97,24 +114,16 @@ def main():
decoded_imgs_list[i],
],
)
print(
"model_trajectory_quantized",
len(model_trajectory_quantized),
model_trajectory_quantized,
)
model_trajectory_quantized = model_trajectory_quantized[0]
print("Model Trajectory Token: ", model_trajectory_quantized)
model_trajectory_quantized = trajectory_encoder.decode(
model_trajectory_quantized
)

print(
"trajectory[0]",
(np.min(trajectory[:, 0]), np.max(trajectory[:, 0])),
)
print(
"trajectory[1]",
(np.min(trajectory[:, 1]), np.max(trajectory[:, 1])),
)
print(
"trajectory[2]",
(np.min(trajectory[:, 2]), np.max(trajectory[:, 2])),
)
dx = trajectory[1:, 2] - trajectory[:-1, 2]
speed = dx / (1.0 / 20.0)
# m/s to km/h
@@ -140,14 +149,18 @@ def main():
color=(0, 0, 255),
)

img_bev = plot_bev_trajectory(trajectory, img, color=(255, 0, 0))
img_bev = plot_bev_trajectory(
img_bev_gt = plot_bev_trajectory(trajectory, img, color=(255, 0, 0))
img_bev_gtq = plot_bev_trajectory(
trajectory_quantized, img, color=(0, 255, 0)
)
img_bev = plot_bev_trajectory(
img_bev_pred = plot_bev_trajectory(
model_trajectory_quantized, img, color=(0, 0, 255)
)

# Overlay BEVs
img_bev = cv2.addWeighted(img_bev_gt, 0.5, img_bev_gtq, 0.5, 0)
img_bev = cv2.addWeighted(img_bev, 0.5, img_bev_pred, 0.5, 0)

# Write speed on img
font = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (10, 50)
@@ -170,9 +183,15 @@

vis = np.concatenate([img, img_bev], axis=1)

if out is None:
out = cv2.VideoWriter(
"test_media/trajectory.mp4", fourcc, 20.0, vis.shape[1::-1]
)

out.write(vis)
cv2.imwrite("test_media/vis.png", vis)

exit()
out.release()


if __name__ == "__main__":
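
One detail worth noting in the new overlay: chaining two `cv2.addWeighted` calls at 0.5/0.5 weights the three BEV plots unequally. A small self-contained check (constant-valued stand-in images so the arithmetic is visible):

```python
import cv2
import numpy as np

# Stand-ins for the GT, quantized-GT, and predicted BEV plots.
gt = np.full((2, 2, 3), 200, np.uint8)
gtq = np.full((2, 2, 3), 100, np.uint8)
pred = np.full((2, 2, 3), 40, np.uint8)

blend = cv2.addWeighted(gt, 0.5, gtq, 0.5, 0)      # 0.5*gt + 0.5*gtq
blend = cv2.addWeighted(blend, 0.5, pred, 0.5, 0)  # 0.25*gt + 0.25*gtq + 0.5*pred
print(blend[0, 0])  # [95 95 95]: the prediction layer dominates
```

Incidentally, `vis.shape[1::-1]` passes the frame size to `cv2.VideoWriter` as `(width, height)`, which is the order OpenCV expects.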
18 changes: 9 additions & 9 deletions drivellava/sparse_llava_dataset.py
@@ -15,6 +15,7 @@
from drivellava.constants import (
COMMAVQ_DIR,
DECODER_ONNX_PATH,
LLAVA_PATH,
get_image_path,
get_json,
)
@@ -33,7 +34,6 @@
plot_steering_traj,
)

LLAVA_PATH = os.path.abspath("./LLaVA")
if LLAVA_PATH not in sys.path:
sys.path.append(LLAVA_PATH)

@@ -125,14 +125,14 @@ def get_drivellava_prompt(trajectory_encoder: TrajectoryEncoder):
+ "You may select one from the "
+ f"following templates: {traj_str}"
)
P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""
P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""
P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""
P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""
P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""
P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""
P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""
P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""
P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}""" # noqa
P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}""" # noqa
P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}""" # noqa
P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}""" # noqa
P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}""" # noqa
P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}""" # noqa
P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}""" # noqa
P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}""" # noqa

return random.choice([P1, P2, P3, P4, P5, P6, P7, P8, P9])

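
All nine phrasings carry the same task and the same trajectory-token listing; sampling among them looks like a light form of instruction augmentation. A toy sketch of the selection (template text abbreviated; `DEFAULT_IMAGE_TOKEN = "<image>"` is an assumption matching `llava.constants`):

```python
import random

DEFAULT_IMAGE_TOKEN = "<image>"  # assumption: value from llava.constants
traj_str = "..."  # formatted list of trajectory-token templates

prompts = [
    f"{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA ... {traj_str}",
    f"{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle ... {traj_str}",
]
print(random.choice(prompts))  # one variant drawn per generated sample
```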
19 changes: 17 additions & 2 deletions drivellava/trajectory_encoder.py
@@ -1,16 +1,19 @@
import json
import os
import pickle
from typing import List

import numpy as np

from drivellava.constants import VOCAB_JSON

NUM_TRAJECTORY_TEMPLATES = 256
TRAJECTORY_SIZE = 20
TRAJECTORY_TEMPLATES_NPY = f"./trajectory_templates/proposed_trajectory_templates_{NUM_TRAJECTORY_TEMPLATES}.npy" # noqa
TRAJECTORY_TEMPLATES_KMEANS_PKL = (
f"./trajectory_templates/kmeans_{NUM_TRAJECTORY_TEMPLATES}.pkl"
)
ENCODING = "UTF-8"
ENCODING = "utf-8"


class TrajectoryEncoder:
@@ -21,12 +24,18 @@ def __init__(
trajectory_size=TRAJECTORY_SIZE,
trajectory_templates_npy=TRAJECTORY_TEMPLATES_NPY,
trajectory_templates_kmeans_pkl=TRAJECTORY_TEMPLATES_KMEANS_PKL,
vocab_json=VOCAB_JSON,
) -> None:
self.num_trajectory_templates = num_trajectory_templates
self.trajectory_templates_npy = trajectory_templates_npy
self.trajectory_templates_kmeans_pkl = trajectory_templates_kmeans_pkl
self.trajectory_size = trajectory_size

with open(vocab_json, "r", encoding=ENCODING) as f:
self.vocab_json = json.load(f)

self.vocab_json_inv = {v: k for k, v in self.vocab_json.items()}

assert os.path.exists(
trajectory_templates_npy
), f"File {trajectory_templates_npy} does not exist"
@@ -52,7 +61,13 @@ def __init__(
with open(trajectory_templates_kmeans_pkl, "rb") as f:
self.kmeans = pickle.load(f)

self.TOKEN_IDS: List[str] = []
self.start_token_id = 31500
self.end_token_id = self.start_token_id + self.num_trajectory_templates

self.TOKEN_IDS: List[str] = [
self.vocab_json_inv[i + self.start_token_id]
for i in range(self.num_trajectory_templates)
]

index = 0

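
The revised tokenizer logic above pins each trajectory template to a fixed slot in the tokenizer's vocabulary. A condensed sketch of that mapping, assuming `media/vocab.json` holds the usual Hugging Face `{token_string: token_id}` layout:

```python
import json

with open("media/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)                      # token string -> token id
vocab_inv = {v: k for k, v in vocab.items()}  # token id -> token string

start_token_id = 31500
num_trajectory_templates = 256

# TOKEN_IDS[k] is the token string the model must emit to select template k.
TOKEN_IDS = [
    vocab_inv[start_token_id + i] for i in range(num_trajectory_templates)
]
```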
