diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 0be35000a5..953b529b60 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -53,7 +53,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "53d4d1f1703a4e52812ea366c06f2d67",
+       "model_id": "a9bd847756fd467e905a7ad7a243640c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -77,7 +77,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a3de9a9bbdd942069b85519c83267f83",
+       "model_id": "9d8ad91623d642f48e85b60ac823aca4",
        "version_major": 2,
        "version_minor": 0
       },
@@ -101,7 +101,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ebc55f3ce3974aaa8861474699d5a15f",
+       "model_id": "a2a7d09a573c4092a830bbaadc39f756",
        "version_major": 2,
        "version_minor": 0
       },
@@ -125,7 +125,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f206e4e8651f4f449f9dcb1fc11ef266",
+       "model_id": "b67c493aab36426090f8fafd25a17a00",
        "version_major": 2,
        "version_minor": 0
       },
@@ -163,7 +163,7 @@
        " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -175,16 +175,17 @@
     "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
     "\n",
     "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "base_model_path = \"all-MiniLM-L6-v2\"\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
     "\n",
     "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
-    "model.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "model.save_pretrained(base_model_path)\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
-    "tokenizer.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "tokenizer.save_pretrained(base_model_path)\n",
+    "\n",
     "DATASET_NAME = \"squad\"\n",
     "dataset = datasets.load_dataset(DATASET_NAME)\n",
-    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
-    "\n",
     "quantizer = OVQuantizer.from_pretrained(model)\n",
     "\n",
     "\n",
@@ -222,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,25 +263,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
-      "Framework not specified. Using pt to export the model.\n",
-      "Using framework PyTorch: 2.4.1+cpu\n",
-      "Overriding 1 configuration item(s)\n",
-      "\t- use_cache -> False\n",
-      "Compiling the model to CPU ...\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
       "Compiling the model to CPU ...\n"
      ]
     }
    ],
    "source": [
-    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(base_model_path)\n",
     "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
     "\n",
     "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
@@ -289,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -302,20 +304,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0f28df147f95484c955c3f20f2f954d2",
+       "model_id": "5cab9e8fc58245a4b395a9575017633b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -350,7 +352,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -358,7 +360,7 @@
      "output_type": "stream",
      "text": [
       "vanilla model: pearson= 0.869619439095004\n",
-      "quantized model: pearson= 0.869326218489249\n",
+      "quantized model: pearson= 0.869415534480936\n",
       "The quantized model achieves 100.0 % accuracy of the fp32 model\n"
      ]
     }
    ],
@@ -392,7 +394,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -413,18 +415,18 @@
      "[ INFO ] Parsing input parameters\n",
      "[Step 2/11] Loading OpenVINO Runtime\n",
      "[ INFO ] OpenVINO:\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
      "[ INFO ] \n",
      "[ INFO ] Device info:\n",
      "[ INFO ] CPU\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
      "[ INFO ] \n",
      "[ INFO ] \n",
      "[Step 3/11] Setting device configuration\n",
      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
      "[Step 4/11] Reading model files\n",
      "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 10.87 ms\n",
+      "[ INFO ] Read model took 10.17 ms\n",
      "[ INFO ] Original model I/O parameters:\n",
      "[ INFO ] Model inputs:\n",
      "[ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -435,7 +437,7 @@
      "[Step 5/11] Resizing model to match image sizes and given batch\n",
      "[ INFO ] Model batch size: 1\n",
      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 3.02 ms\n",
+      "[ INFO ] Reshape model took 2.23 ms\n",
      "[Step 6/11] Configuring input of the model\n",
      "[ INFO ] Model inputs:\n",
      "[ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -444,7 +446,7 @@
      "[ INFO ] Model outputs:\n",
      "[ INFO ] last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
      "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 125.14 ms\n",
+      "[ INFO ] Compile model took 134.63 ms\n",
      "[Step 8/11] Querying optimal runtime parameters\n",
      "[ INFO ] Model:\n",
      "[ INFO ] NETWORK_NAME: Model0\n",
@@ -476,22 +478,16 @@
      "[ INFO ] Fill input 'token_type_ids' with random values \n",
      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 13.97 ms\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "[ INFO ] First inference took 12.27 ms\n",
      "[Step 11/11] Dumping statistics report\n",
      "[ INFO ] Execution Devices:['CPU']\n",
      "[ INFO ] Count: 200 iterations\n",
-      "[ INFO ] Duration: 1988.82 ms\n",
+      "[ INFO ] Duration: 1988.84 ms\n",
      "[ INFO ] Latency:\n",
-      "[ INFO ] Median: 9.70 ms\n",
+      "[ INFO ] Median: 9.74 ms\n",
      "[ INFO ] Average: 9.77 ms\n",
-      "[ INFO ] Min: 9.54 ms\n",
-      "[ INFO ] Max: 11.35 ms\n",
+      "[ INFO ] Min: 9.59 ms\n",
+      "[ INFO ] Max: 11.12 ms\n",
      "[ INFO ] Throughput: 100.56 FPS\n"
     ]
    }
   ],
@@ -503,7 +499,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -524,18 +520,18 @@
      "[ INFO ] Parsing input parameters\n",
      "[Step 2/11] Loading OpenVINO Runtime\n",
      "[ INFO ] OpenVINO:\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
      "[ INFO ] \n",
      "[ INFO ] Device info:\n",
      "[ INFO ] CPU\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
      "[ INFO ] \n",
      "[ INFO ] \n",
      "[Step 3/11] Setting device configuration\n",
      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
      "[Step 4/11] Reading model files\n",
      "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 15.46 ms\n",
+      "[ INFO ] Read model took 20.87 ms\n",
      "[ INFO ] Original model I/O parameters:\n",
      "[ INFO ] Model inputs:\n",
      "[ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -546,7 +542,7 @@
      "[Step 5/11] Resizing model to match image sizes and given batch\n",
      "[ INFO ] Model batch size: 1\n",
      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 6.89 ms\n",
+      "[ INFO ] Reshape model took 3.42 ms\n",
      "[Step 6/11] Configuring input of the model\n",
      "[ INFO ] Model inputs:\n",
      "[ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -555,7 +551,7 @@
      "[ INFO ] Model outputs:\n",
      "[ INFO ] last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
      "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 325.40 ms\n",
+      "[ INFO ] Compile model took 323.91 ms\n",
      "[Step 8/11] Querying optimal runtime parameters\n",
      "[ INFO ] Model:\n",
      "[ INFO ] NETWORK_NAME: Model0\n",
@@ -587,17 +583,17 @@
      "[ INFO ] Fill input 'token_type_ids' with random values \n",
      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 8.49 ms\n",
+      "[ INFO ] First inference took 6.72 ms\n",
      "[Step 11/11] Dumping statistics report\n",
      "[ INFO ] Execution Devices:['CPU']\n",
      "[ INFO ] Count: 200 iterations\n",
-      "[ INFO ] Duration: 869.96 ms\n",
+      "[ INFO ] Duration: 853.85 ms\n",
      "[ INFO ] Latency:\n",
-      "[ INFO ] Median: 4.17 ms\n",
-      "[ INFO ] Average: 4.23 ms\n",
-      "[ INFO ] Min: 4.08 ms\n",
-      "[ INFO ] Max: 6.04 ms\n",
-      "[ INFO ] Throughput: 229.89 FPS\n"
+      "[ INFO ] Median: 4.13 ms\n",
+      "[ INFO ] Average: 4.15 ms\n",
+      "[ INFO ] Min: 4.05 ms\n",
+      "[ INFO ] Max: 5.13 ms\n",
+      "[ INFO ] Throughput: 234.23 FPS\n"
     ]
    }
   ],
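Reviewer note, not part of the diff: the change above introduces `base_model_path` and `int8_ptq_model_path` and reloads the FP32 model from the local OpenVINO export instead of re-converting it on every run, which is why the `export=True`/`Framework not specified` warnings disappear from the outputs. A minimal sketch of the flow these cells exercise, using only the `optimum.intel` classes the notebook imports; the `preprocess_fn` helper, the `num_samples` value, and the tokenization settings are assumptions for illustration, not taken from the diff (`max_length=384` mirrors the benchmarked input shape):

```python
from transformers import AutoTokenizer
from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig

MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
base_model_path = "all-MiniLM-L6-v2"
int8_ptq_model_path = "all-MiniLM-L6-v2_int8"

# Export the checkpoint to OpenVINO IR once and save it locally, so later
# cells can reload it from disk without triggering a re-export.
model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)
model.save_pretrained(base_model_path)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.save_pretrained(base_model_path)

# Static INT8 post-training quantization needs calibration data; the
# notebook draws it from SQuAD.
quantizer = OVQuantizer.from_pretrained(model)

def preprocess_fn(examples):
    # Hypothetical stand-in for the notebook's preprocessing step.
    return tokenizer(examples["question"], padding="max_length", max_length=384, truncation=True)

calibration_dataset = quantizer.get_calibration_dataset(
    "squad",
    preprocess_function=preprocess_fn,
    num_samples=300,
    dataset_split="train",
)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=OVQuantizationConfig()),
    calibration_dataset=calibration_dataset,
    save_directory=int8_ptq_model_path,
)

# Both variants now load from local paths, as in the updated comparison cells.
fp32_model = OVModelForFeatureExtraction.from_pretrained(base_model_path)
int8_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)
```

The 11-step logs in the outputs come from OpenVINO's `benchmark_app` CLI run against each saved `openvino_model.xml`; the refreshed numbers show the INT8 model at roughly 2.3x the FP32 throughput (234.23 vs 100.56 FPS) with a Pearson correlation essentially unchanged.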