From 8a15e25e5fac89d6153a9a63c16177284aae8d91 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 12 Dec 2023 20:56:26 +0400 Subject: [PATCH] fix chatglm convert (#1535) --- .../254-llm-chatbot/254-llm-chatbot.ipynb | 447 +++++++++++++++++- .../254-llm-chatbot/254-rag-chatbot.ipynb | 3 +- notebooks/254-llm-chatbot/converter.py | 41 ++ 3 files changed, 464 insertions(+), 27 deletions(-) diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb index 15f4647bcc8..5e9294b80e5 100644 --- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb @@ -129,12 +129,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "12869548c4ab4da5bc48a30e9b87232d", + "model_id": "7723267ce24448fcbe4658b6e20cd404", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b', 'mpt-7b…" + "Dropdown(description='Model:', index=5, options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b'…" ] }, "execution_count": 3, @@ -167,7 +167,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected model tiny-llama-1b-chat\n" + "Selected model chatglm2-6b\n" ] } ], @@ -284,7 +284,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6a1aaf797854154b634266c484d0473", + "model_id": "9ad05ac3909b405582d93594f549fd94", "version_major": 2, "version_minor": 0 }, @@ -298,7 +298,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45fed299cf854328a8781df5e40c34dd", + "model_id": "ff880044976144c1b775423eaa85fcbb", "version_major": 2, "version_minor": 0 }, @@ -312,7 +312,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f9665924a89a42f2872dd8b85815da78", + "model_id": "00e933d04a98467a97ee0fd5aacd2c93", "version_major": 2, "version_minor": 0 }, @@ -358,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c4ef9112", "metadata": { "collapsed": false, @@ -366,7 +366,82 @@ "outputs_hidden": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86350f1ebcba4f22a502ef4e6419fb69", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f486dd77a1a04c8295a25d2df0cd3f3f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "nncf.set_log_level(logging.ERROR)\n", "\n", @@ -466,7 +541,6 @@ " \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n", " \"group_size\": 128,\n", " \"ratio\": 0.72,\n", - " \"ignored_scope\": nncf.IgnoredScope([\"__module.transformer/aten::index_67/Gather\"])\n", " },\n", " \"qwen-7b-chat\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM, \n", @@ -543,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "281f1d07-998e-4e13-ba95-0264564ede82", "metadata": {}, "outputs": [ @@ -551,7 +625,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Size of model with INT4 compressed weights is 696.99 MB\n" + "Size of FP16 model is 11912.69 MB\n", + "Size of model with INT4 compressed weights is 4067.25 MB\n", + "Compression rate for INT4 model: 2.929\n" ] } ], @@ -586,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -595,7 +671,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9c70e3b670b143b9bdbb89b01452eb19", + "model_id": "4af32a190c7a4896a06743fe05c7b56b", "version_major": 2, "version_minor": 0 }, @@ -603,7 +679,7 @@ "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -630,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -652,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -664,15 +740,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f1f433117e474ab99409f32b1b686941", + "model_id": "0f954bac863d48f5ab0b9eb779f0a82d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('INT4',), value='INT4')" + "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -698,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -708,7 +784,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from tiny-llama-1b-chat/INT4_compressed_weights\n" + "Loading model from chatglm2-6b/INT4_compressed_weights\n" ] }, { @@ -753,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -761,8 +837,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n", - "/home/ea/work/genai_env/lib/python3.8/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ea/work/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. 
Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] }, @@ -1208,7 +1283,329 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": {}, + "state": { + "00e933d04a98467a97ee0fd5aacd2c93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare FP16 model", + "disabled": false, + "layout": "IPY_MODEL_64ad62f21060421aabbd8456c48697d3", + "style": "IPY_MODEL_a39a1b814f3d4fd5b535216f429ae000", + "value": false + } + }, + "0f954bac863d48f5ab0b9eb779f0a82d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "INT4", + "FP16" + ], + "description": "Model to run:", + "index": 0, + "layout": "IPY_MODEL_b2994422e6b34af9bff3d3c0cf1f161d", + "style": "IPY_MODEL_a4b7e815e1ab4f808a22dc551e8a6b36" + } + }, + "0feae37e49714d3caf29da081719dc69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "107ea5d1ee5e4720b57eee0b12c341f0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "1f7ebe01ae0f49b3aa26fefe1d2c9630": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "375501ad8742471ca56c5ffcac5ee9f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "4910e7d802ec4b2683801e7ac17ca700": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_5778c5124e5641cd88242830477a7ab3", + "max": 7, + "style": "IPY_MODEL_bf691f1d350149d69dff8cf5f08dd9e8", + "value": 7 + } + }, + "4af32a190c7a4896a06743fe05c7b56b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "CPU", + "GPU.0", + "GPU.1", + "AUTO" + ], + "description": "Device:", + "index": 0, + "layout": "IPY_MODEL_aa16bf7c3b94482295f85dec38155ee1", + "style": "IPY_MODEL_4fdefc6d2c57438380fa16eb0645ab3a" + } + }, + "4fdefc6d2c57438380fa16eb0645ab3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "54390732209e40b18d54807210fd46ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "5778c5124e5641cd88242830477a7ab3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "5ab2c71dcf87422cbde965d6f0661ef0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_1f7ebe01ae0f49b3aa26fefe1d2c9630", + "style": "IPY_MODEL_b469729a7d3140b0b035aaa8e7733186", + "value": " 7/7 [00:08<00:00, 1.13s/it]" + } + }, + "64ad62f21060421aabbd8456c48697d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "7723267ce24448fcbe4658b6e20cd404": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "tiny-llama-1b-chat", + "red-pajama-3b-chat", + "llama-2-chat-7b", + "mpt-7b-chat", + "qwen-7b-chat", + "chatglm2-6b", + "mistal-7b", + "zephyr-7b-beta", + "neural-chat-7b-v3-1", + "notus-7b-v1", + "youri-7b-chat" + ], + "description": "Model:", + "index": 5, + "layout": "IPY_MODEL_375501ad8742471ca56c5ffcac5ee9f8", + "style": "IPY_MODEL_f62810db78ab4b4ba44f59e98baf7333" + } + }, + "86350f1ebcba4f22a502ef4e6419fb69": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_107ea5d1ee5e4720b57eee0b12c341f0", + "outputs": [ + { + "data": { + "text/html": "
Searching for Mixed-Precision Configuration ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 112/112 • 0:04:36 • 0:00:00\n
\n", + "text/plain": "Searching for Mixed-Precision Configuration \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m112/112\u001b[0m • \u001b[38;2;0;104;181m0:04:36\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "9ad05ac3909b405582d93594f549fd94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT4 model", + "disabled": false, + "layout": "IPY_MODEL_54390732209e40b18d54807210fd46ce", + "style": "IPY_MODEL_0feae37e49714d3caf29da081719dc69", + "value": true + } + }, + "9df482a97a8c48048c1b14d023c192be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_fe514aa9268c40d79cbcce1266bc47c9", + "style": "IPY_MODEL_e19a4ef87232458081076551a2f030b2", + "value": "Loading checkpoint shards: 100%" + } + }, + "a39a1b814f3d4fd5b535216f429ae000": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "a4b7e815e1ab4f808a22dc551e8a6b36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "aa16bf7c3b94482295f85dec38155ee1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "b2994422e6b34af9bff3d3c0cf1f161d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "b469729a7d3140b0b035aaa8e7733186": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "bba76ebbf6d741fcbc92b3a210464899": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_9df482a97a8c48048c1b14d023c192be", + "IPY_MODEL_4910e7d802ec4b2683801e7ac17ca700", + "IPY_MODEL_5ab2c71dcf87422cbde965d6f0661ef0" + ], + "layout": "IPY_MODEL_eeb37297a8f94696bd754a96f0057b12" + } + }, + "bc30bb99906245b18e4c752439fe8f03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "bf691f1d350149d69dff8cf5f08dd9e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "dda49cfd9dd04af69ba91d10f8e4e175": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "e19a4ef87232458081076551a2f030b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "eeb37297a8f94696bd754a96f0057b12": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "f1d5bbc922ce40d6885cb8ee7f6f9e50": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "f486dd77a1a04c8295a25d2df0cd3f3f": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_dda49cfd9dd04af69ba91d10f8e4e175", + "outputs": [ + { + "data": { + "text/html": "
Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 114/114 • 0:06:09 • 0:00:00\n
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m114/114\u001b[0m • \u001b[38;2;0;104;181m0:06:09\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "f62810db78ab4b4ba44f59e98baf7333": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "fe514aa9268c40d79cbcce1266bc47c9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "ff880044976144c1b775423eaa85fcbb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT8 model", + "disabled": false, + "layout": "IPY_MODEL_f1d5bbc922ce40d6885cb8ee7f6f9e50", + "style": "IPY_MODEL_bc30bb99906245b18e4c752439fe8f03", + "value": false + } + } + }, "version_major": 2, "version_minor": 0 } diff --git a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb index ac960aa4303..4caa0e05abb 100644 --- a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb @@ -428,8 +428,7 @@ " \"chatglm2-6b\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n", " \"group_size\": 128,\n", - " \"ratio\": 0.72,\n", - " \"ignored_scope\": nncf.IgnoredScope([\"__module.transformer/aten::index_67/Gather\"])\n", + " \"ratio\": 0.72\n", " },\n", " \"qwen-7b-chat\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM, \n", diff --git a/notebooks/254-llm-chatbot/converter.py b/notebooks/254-llm-chatbot/converter.py index c54ada629f7..87210da4399 100644 --- a/notebooks/254-llm-chatbot/converter.py +++ b/notebooks/254-llm-chatbot/converter.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Tuple, Optional import types +from transformers.modeling_outputs import BaseModelOutputWithPast def flattenize_inputs(inputs): @@ -172,6 +173,41 @@ def convert_qwen(pt_model: torch.nn.Module, model_path: Path): del pt_model +@torch.jit.script_if_tracing +def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): + mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) + if query_layer.shape[2] == key_layer.shape[2]: + tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=mask) + return context_layer + + +def _core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None: + context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) + else: + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attention_mask + ) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + 
+@torch.jit.script_if_tracing +def _get_chatglm_attention_mask(input_ids, past_key): + mask = torch.zeros((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=past_key.dtype) + if past_key.shape[0] == 0: + tmp_mask = torch.ones((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + return mask + + def _chatglm_transformer_forward( self, input_ids, @@ -245,6 +281,11 @@ def _chatglm_transformer_forward( def _patch_chatglm_forward(model: "PreTrainedModel"): model.transformer.forward = types.MethodType(_chatglm_transformer_forward, model.transformer) + for block in model.transformer.encoder.layers: + block.self_attention.core_attention.forward = types.MethodType( + _core_attention_forward, block.self_attention.core_attention + ) + def convert_chatglm2(pt_model: torch.nn.Module, model_path: Path): """