feat(tokenizer): revised logic
AdityaNG committed Feb 28, 2024
1 parent 69f1fad commit 7c113ef
Showing 7 changed files with 32,094 additions and 46 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -63,6 +63,15 @@ python3 -m drivellava.scripts.generate_sparse_llava_dataset
BNB_CUDA_VERSION=118 python3 -m drivellava.scripts.train
```

Merge the trained LoRA weights into the base model
```bash
cd LLaVA/
python scripts/merge_lora_weights.py \
--model-path /path/to/lora_model \
--model-base /path/to/base_model \
--save-model-path /path/to/merged_model
```
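
For instance, with the LoRA checkpoint used by `drivellava/scripts/eval.py` (all paths below are illustrative placeholders, not committed defaults):
```bash
cd LLaVA/
python scripts/merge_lora_weights.py \
    --model-path ~/Datasets/checkpoints/checkpoint-4000 \
    --model-base liuhaotian/llava-v1.5-7b \
    --save-model-path ~/Datasets/checkpoints/merged_model
```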

Set up the Docker container for training
```bash
docker compose run dev
4 changes: 4 additions & 0 deletions drivellava/constants.py
@@ -19,6 +19,8 @@ def __getitem__(self, index):
)


LLAVA_PATH = os.path.abspath("./LLaVA")

COMMAVQ_DIR = os.path.expanduser("~/Datasets/commavq")

# List of all the videos
@@ -53,6 +55,8 @@ def __getitem__(self, index):

ENCODED_JSON_ALL = ENCODED_JSON + VAL_ENCODED_JSON

VOCAB_JSON = os.path.abspath(os.path.join("media", "vocab.json"))


def get_image_path(encoded_video_path: str, index: int) -> str:
return os.path.join(
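
A minimal sketch of how the two new constants are consumed elsewhere in this commit (mirroring `drivellava/model.py` and `drivellava/trajectory_encoder.py` below):

```python
import sys

from drivellava.constants import LLAVA_PATH, VOCAB_JSON

# Make the vendored LLaVA checkout importable; model.py and
# sparse_llava_dataset.py now share this constant instead of
# recomputing the path locally.
if LLAVA_PATH not in sys.path:
    sys.path.append(LLAVA_PATH)

# Absolute path to media/vocab.json, read by TrajectoryEncoder.
print(VOCAB_JSON)
```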
29 changes: 14 additions & 15 deletions drivellava/model.py
@@ -1,4 +1,3 @@
import os
import re
import sys
from io import BytesIO
@@ -8,10 +7,7 @@
import torch
from PIL import Image


def image_parser(args):
out = args.image_file.split(args.sep)
return out
from drivellava.constants import LLAVA_PATH


def load_image(image_file):
@@ -34,26 +30,28 @@ def load_images(image_files):
class DriveLLaVA:
def __init__(self, args):

LLAVA_PATH = os.path.abspath("./LLaVA")

if LLAVA_PATH not in sys.path:
sys.path.append(LLAVA_PATH)

from llava.mm_utils import get_model_name_from_path
# from llava.mm_utils import get_model_name_from_path
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init

# Model Initialization
# disable_torch_init() skips PyTorch's redundant default weight
# initialization to speed up model loading
disable_torch_init()

self.model_name = get_model_name_from_path(args.model_path)
# self.model_name = get_model_name_from_path(args.model_path)
self.model_name = "liuhaotian/llava-v1.5-7b"
# self.model_name = "llava_llama_2"

print("model_name", self.model_name)
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(
args.model_path,
args.model_base,
self.model_name,
load_8bit=True,
load_8bit=False,
)
)

@@ -71,6 +69,8 @@ def __init__(self, args):
else:
self.conv_mode = "llava_v0"

# self.conv_mode = "llava_llama_2"

if args.conv_mode is not None and self.conv_mode != args.conv_mode:
print(
f"[WARNING] the auto inferred conversation mode is "
@@ -120,7 +120,6 @@ def run(self, query: str, image_files: List[str]):
prompt = conv.get_prompt()

# Process images
# image_files = image_parser(self.args)
images = load_images(image_files)
image_sizes = [x.size for x in images]
images_tensor = process_images(
@@ -150,9 +149,9 @@ def run(self, query: str, image_files: List[str]):
use_cache=True,
)

outputs = self.tokenizer.batch_decode(
output_ids, skip_special_tokens=True
)[0].strip()
print(outputs)
outputs = self.tokenizer.batch_decode(output_ids)
print("outputs", outputs)

outputs = outputs[0]

return outputs
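
The decode change above is the behavioral core of this commit. A sketch of the difference, assuming the trajectory tokens occupy vocabulary ids 31500 and up (per `drivellava/trajectory_encoder.py` below) and may be registered as special tokens, in which case `skip_special_tokens=True` would strip them from the decoded text:

```python
# Sketch only; `tokenizer` and `output_ids` are as in DriveLLaVA.run().
# Old path: added/special tokens can be dropped during decoding,
# which would lose the predicted trajectory token.
lossy = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

# New path: decode verbatim and keep the first (and only) sequence,
# so the trajectory token survives for downstream matching.
outputs = tokenizer.batch_decode(output_ids)
prediction = outputs[0]
```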
59 changes: 39 additions & 20 deletions drivellava/scripts/eval.py
@@ -21,10 +21,20 @@
)
from drivellava.utils import plot_bev_trajectory, plot_steering_traj

# import sys


def main():

fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
# sys.path.append(LLAVA_PATH)

# from transformers.models.llava.configuration_llava import LlavaConfig

# fine_tuned_model_path = "liuhaotian/llava-v1.5-7b"
fine_tuned_model_path = os.path.expanduser(
"~/Datasets/checkpoints/checkpoint-4000/"
# '~/Datasets/checkpoints/checkpoint-4000/drivellava.bin'
)

args = type(
"Args",
@@ -34,18 +44,21 @@ def main():
"model_base": None,
# "model_name": get_model_name_from_path(fine_tuned_model_path),
# "query": prompt,
"conv_mode": None,
"conv_mode": "llava_llama_2",
# "image_file": image_file,
# "sep": ",",
"temperature": 0,
"top_p": None,
"num_beams": 1,
"max_new_tokens": 512,
"max_new_tokens": 4,
},
)()

model = DriveLLaVA(args)

print(dir(model.tokenizer))
print(model.tokenizer.get_vocab())

NUM_FRAMES = 20 * 1

encoded_video_path = "/root/Datasets/commavq/val/fe809f0fff5562cc4d2bdc073d242123_31.npy" # noqa
@@ -79,6 +92,10 @@ def main():
trajectory_encoder=trajectory_encoder,
)

# Save to video
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = None

# Iterate over the embeddings in batches and decode the images
for i in tqdm(
range(0, len(decoded_imgs_list) - NUM_FRAMES, 1),
@@ -97,24 +114,16 @@ def main():
decoded_imgs_list[i],
],
)
print(
"model_trajectory_quantized",
len(model_trajectory_quantized),
model_trajectory_quantized,
)
model_trajectory_quantized = model_trajectory_quantized[0]
print("Model Trajectory Token: ", model_trajectory_quantized)
model_trajectory_quantized = trajectory_encoder.decode(
model_trajectory_quantized
)

print(
"trajectory[0]",
(np.min(trajectory[:, 0]), np.max(trajectory[:, 0])),
)
print(
"trajectory[1]",
(np.min(trajectory[:, 1]), np.max(trajectory[:, 1])),
)
print(
"trajectory[2]",
(np.min(trajectory[:, 2]), np.max(trajectory[:, 2])),
)
dx = trajectory[1:, 2] - trajectory[:-1, 2]
speed = dx / (1.0 / 20.0)
# m/s to km/h
@@ -140,14 +149,18 @@ def main():
color=(0, 0, 255),
)

img_bev = plot_bev_trajectory(trajectory, img, color=(255, 0, 0))
img_bev = plot_bev_trajectory(
img_bev_gt = plot_bev_trajectory(trajectory, img, color=(255, 0, 0))
img_bev_gtq = plot_bev_trajectory(
trajectory_quantized, img, color=(0, 255, 0)
)
img_bev = plot_bev_trajectory(
img_bev_pred = plot_bev_trajectory(
model_trajectory_quantized, img, color=(0, 0, 255)
)

# Overlay BEVs
img_bev = cv2.addWeighted(img_bev_gt, 0.5, img_bev_gtq, 0.5, 0)
img_bev = cv2.addWeighted(img_bev, 0.5, img_bev_pred, 0.5, 0)

# Write speed on img
font = cv2.FONT_HERSHEY_SIMPLEX
bottomLeftCornerOfText = (10, 50)
@@ -170,9 +183,15 @@

vis = np.concatenate([img, img_bev], axis=1)

if out is None:
out = cv2.VideoWriter(
"test_media/trajectory.mp4", fourcc, 20.0, vis.shape[1::-1]
)

out.write(vis)
cv2.imwrite("test_media/vis.png", vis)

exit()
out.release()


if __name__ == "__main__":
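
One detail worth noting in the new overlay: chaining two `cv2.addWeighted` calls at 0.5/0.5 weights the three BEV plots unequally. A small self-contained check (constant-valued stand-in images so the arithmetic is visible):

```python
import cv2
import numpy as np

# Stand-ins for the GT, quantized-GT, and predicted BEV plots.
gt = np.full((2, 2, 3), 200, np.uint8)
gtq = np.full((2, 2, 3), 100, np.uint8)
pred = np.full((2, 2, 3), 40, np.uint8)

blend = cv2.addWeighted(gt, 0.5, gtq, 0.5, 0)      # 0.5*gt + 0.5*gtq
blend = cv2.addWeighted(blend, 0.5, pred, 0.5, 0)  # 0.25*gt + 0.25*gtq + 0.5*pred
print(blend[0, 0])  # [95 95 95]: the prediction layer dominates
```

Incidentally, `vis.shape[1::-1]` passes the frame size to `cv2.VideoWriter` as `(width, height)`, which is the order OpenCV expects.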
18 changes: 9 additions & 9 deletions drivellava/sparse_llava_dataset.py
@@ -15,6 +15,7 @@
from drivellava.constants import (
COMMAVQ_DIR,
DECODER_ONNX_PATH,
LLAVA_PATH,
get_image_path,
get_json,
)
@@ -33,7 +34,6 @@
plot_steering_traj,
)

LLAVA_PATH = os.path.abspath("./LLaVA")
if LLAVA_PATH not in sys.path:
sys.path.append(LLAVA_PATH)

@@ -125,14 +125,14 @@ def get_drivellava_prompt(trajectory_encoder: TrajectoryEncoder):
+ "You may select one from the "
+ f"following templates: {traj_str}"
)
P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}"""
P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}"""
P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}"""
P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}"""
P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}"""
P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}"""
P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}"""
P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}"""
P2 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle, your task is to analyze the given image and determine the optimal driving path. Choose the most suitable trajectory option from the list provided based on the visual information. {traj_str}""" # noqa
P3 = f"""{DEFAULT_IMAGE_TOKEN} You are the AI system DriveLLaVA, responsible for navigating self-driving cars. With the image provided as your guide, select the correct trajectory from the options below to ensure a safe and efficient route. {traj_str}""" # noqa
P4 = f"""{DEFAULT_IMAGE_TOKEN} Imagine yourself as DriveLLaVA, an advanced self-driving vehicle intelligence. Examine the scenario depicted in the image and decide on the best course of action by selecting an appropriate trajectory from the given templates. {traj_str}""" # noqa
P5 = f"""{DEFAULT_IMAGE_TOKEN} You embody DriveLLaVA, the brain behind autonomous driving technology. Given the context shown in the image, it's your job to pick the right trajectory from the available choices to navigate safely. {traj_str}""" # noqa
P6 = f"""{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, a pioneering self-driving car AI, you're tasked with interpreting the visual cues in the provided image to choose the most suitable trajectory from the list of options to ensure a smooth journey. {traj_str}""" # noqa
P7 = f"""{DEFAULT_IMAGE_TOKEN} You, as DriveLLaVA, are at the forefront of autonomous navigation. Assess the situation depicted in the image and select the trajectory that best aligns with safe and efficient driving principles from the options provided. {traj_str}""" # noqa
P8 = f"""{DEFAULT_IMAGE_TOKEN} Functioning as DriveLLaVA, the self-driving car's decision-making system, you must look at the image and determine the best path forward by choosing from the predefined trajectory templates. {traj_str}""" # noqa
P9 = f"""{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA, an AI designed for autonomous vehicles. Your objective is to analyze the context presented in the image and select a trajectory that guarantees the safety and comfort of your passengers from the given templates. {traj_str}""" # noqa

return random.choice([P1, P2, P3, P4, P5, P6, P7, P8, P9])

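
All nine phrasings carry the same task and the same trajectory-token listing; sampling among them looks like a light form of instruction augmentation. A toy sketch of the selection (template text abbreviated; `DEFAULT_IMAGE_TOKEN = "<image>"` is an assumption matching `llava.constants`):

```python
import random

DEFAULT_IMAGE_TOKEN = "<image>"  # assumption: value from llava.constants
traj_str = "..."  # formatted list of trajectory-token templates

prompts = [
    f"{DEFAULT_IMAGE_TOKEN} You are DriveLLaVA ... {traj_str}",
    f"{DEFAULT_IMAGE_TOKEN} As DriveLLaVA, the autonomous vehicle ... {traj_str}",
]
print(random.choice(prompts))  # one variant drawn per generated sample
```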
19 changes: 17 additions & 2 deletions drivellava/trajectory_encoder.py
@@ -1,16 +1,19 @@
import json
import os
import pickle
from typing import List

import numpy as np

from drivellava.constants import VOCAB_JSON

NUM_TRAJECTORY_TEMPLATES = 256
TRAJECTORY_SIZE = 20
TRAJECTORY_TEMPLATES_NPY = f"./trajectory_templates/proposed_trajectory_templates_{NUM_TRAJECTORY_TEMPLATES}.npy" # noqa
TRAJECTORY_TEMPLATES_KMEANS_PKL = (
f"./trajectory_templates/kmeans_{NUM_TRAJECTORY_TEMPLATES}.pkl"
)
ENCODING = "UTF-8"
ENCODING = "utf-8"


class TrajectoryEncoder:
@@ -21,12 +24,18 @@ def __init__(
trajectory_size=TRAJECTORY_SIZE,
trajectory_templates_npy=TRAJECTORY_TEMPLATES_NPY,
trajectory_templates_kmeans_pkl=TRAJECTORY_TEMPLATES_KMEANS_PKL,
vocab_json=VOCAB_JSON,
) -> None:
self.num_trajectory_templates = num_trajectory_templates
self.trajectory_templates_npy = trajectory_templates_npy
self.trajectory_templates_kmeans_pkl = trajectory_templates_kmeans_pkl
self.trajectory_size = trajectory_size

with open(vocab_json, "r", encoding=ENCODING) as f:
self.vocab_json = json.load(f)

self.vocab_json_inv = {v: k for k, v in self.vocab_json.items()}

assert os.path.exists(
trajectory_templates_npy
), f"File {trajectory_templates_npy} does not exist"
@@ -52,7 +61,13 @@ def __init__(
with open(trajectory_templates_kmeans_pkl, "rb") as f:
self.kmeans = pickle.load(f)

self.TOKEN_IDS: List[str] = []
self.start_token_id = 31500
self.end_token_id = self.start_token_id + self.num_trajectory_templates

self.TOKEN_IDS: List[str] = [
self.vocab_json_inv[i + self.start_token_id]
for i in range(self.num_trajectory_templates)
]

index = 0

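
The revised tokenizer logic above pins each trajectory template to a fixed slot in the tokenizer's vocabulary. A condensed sketch of that mapping, assuming `media/vocab.json` holds the usual Hugging Face `{token_string: token_id}` layout:

```python
import json

with open("media/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)                      # token string -> token id
vocab_inv = {v: k for k, v in vocab.items()}  # token id -> token string

start_token_id = 31500
num_trajectory_templates = 256

# TOKEN_IDS[k] is the token string the model must emit to select template k.
TOKEN_IDS = [
    vocab_inv[start_token_id + i] for i in range(num_trajectory_templates)
]
```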
