From cf77ce829e0fefd13df4dfb61b2d9755614df787 Mon Sep 17 00:00:00 2001 From: mpc Date: Wed, 9 Oct 2024 08:06:57 +0100 Subject: [PATCH] Added sample notebook running vllm --- notebooks/vllm_test.ipynb | 153 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 notebooks/vllm_test.ipynb diff --git a/notebooks/vllm_test.ipynb b/notebooks/vllm_test.ipynb new file mode 100644 index 0000000..e8b19df --- /dev/null +++ b/notebooks/vllm_test.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from vllm import LLM, SamplingParams" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "prompts = [\n", + " \"Tell me a joke.\"\n", + "]\n", + "params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "if not os.environ.get(\"HF_TOKEN\"): raise RuntimeError(\"HF_TOKEN is not set; export it in your shell before starting Jupyter instead of hard-coding it in the notebook\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING 09-27 11:20:50 config.py:319] bitsandbytes quantization is not fully optimized yet. 
The speed can be slower than non-quantized models.\n", + "INFO 09-27 11:20:50 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', speculative_config=None, tokenizer='unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)\n", + "INFO 09-27 11:20:51 model_runner.py:1014] Starting to load model unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit...\n", + "INFO 09-27 11:20:51 loader.py:1014] Loading weights with BitsAndBytes quantization. May take a while ...\n", + "INFO 09-27 11:20:51 weight_utils.py:242] Using model weights format ['*.safetensors']\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1591f28b46054d24890b33e117b5ddc4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00