From 8a15e25e5fac89d6153a9a63c16177284aae8d91 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 12 Dec 2023 20:56:26 +0400 Subject: [PATCH] fix chatglm convert (#1535) --- .../254-llm-chatbot/254-llm-chatbot.ipynb | 447 +++++++++++++++++- .../254-llm-chatbot/254-rag-chatbot.ipynb | 3 +- notebooks/254-llm-chatbot/converter.py | 41 ++ 3 files changed, 464 insertions(+), 27 deletions(-) diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb index 15f4647bcc8..5e9294b80e5 100644 --- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb @@ -129,12 +129,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "12869548c4ab4da5bc48a30e9b87232d", + "model_id": "7723267ce24448fcbe4658b6e20cd404", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b', 'mpt-7b…" + "Dropdown(description='Model:', index=5, options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b'…" ] }, "execution_count": 3, @@ -167,7 +167,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected model tiny-llama-1b-chat\n" + "Selected model chatglm2-6b\n" ] } ], @@ -284,7 +284,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e6a1aaf797854154b634266c484d0473", + "model_id": "9ad05ac3909b405582d93594f549fd94", "version_major": 2, "version_minor": 0 }, @@ -298,7 +298,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45fed299cf854328a8781df5e40c34dd", + "model_id": "ff880044976144c1b775423eaa85fcbb", "version_major": 2, "version_minor": 0 }, @@ -312,7 +312,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f9665924a89a42f2872dd8b85815da78", + "model_id": "00e933d04a98467a97ee0fd5aacd2c93", "version_major": 2, "version_minor": 0 }, @@ -358,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c4ef9112", "metadata": { "collapsed": false, @@ -366,7 +366,82 @@ "outputs_hidden": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86350f1ebcba4f22a502ef4e6419fb69", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f486dd77a1a04c8295a25d2df0cd3f3f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "nncf.set_log_level(logging.ERROR)\n", "\n", @@ -466,7 +541,6 @@ " \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n", " \"group_size\": 128,\n", " \"ratio\": 0.72,\n", - " \"ignored_scope\": nncf.IgnoredScope([\"__module.transformer/aten::index_67/Gather\"])\n", " },\n", " \"qwen-7b-chat\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM, \n", @@ -543,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "281f1d07-998e-4e13-ba95-0264564ede82", "metadata": {}, "outputs": [ @@ -551,7 +625,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Size of model with INT4 compressed weights is 696.99 MB\n" + "Size of FP16 model is 11912.69 MB\n", + "Size of model with INT4 compressed weights is 4067.25 MB\n", + "Compression rate for INT4 model: 2.929\n" ] } ], @@ -586,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -595,7 +671,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9c70e3b670b143b9bdbb89b01452eb19", + "model_id": "4af32a190c7a4896a06743fe05c7b56b", "version_major": 2, "version_minor": 0 }, @@ -603,7 +679,7 @@ "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -630,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -652,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -664,15 +740,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f1f433117e474ab99409f32b1b686941", + "model_id": "0f954bac863d48f5ab0b9eb779f0a82d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('INT4',), value='INT4')" + "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -698,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -708,7 +784,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from tiny-llama-1b-chat/INT4_compressed_weights\n" + "Loading model from chatglm2-6b/INT4_compressed_weights\n" ] }, { @@ -753,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -761,8 +837,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n", - "/home/ea/work/genai_env/lib/python3.8/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ea/work/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. 
Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] }, @@ -1208,7 +1283,329 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": {}, + "state": { + "00e933d04a98467a97ee0fd5aacd2c93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare FP16 model", + "disabled": false, + "layout": "IPY_MODEL_64ad62f21060421aabbd8456c48697d3", + "style": "IPY_MODEL_a39a1b814f3d4fd5b535216f429ae000", + "value": false + } + }, + "0f954bac863d48f5ab0b9eb779f0a82d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "INT4", + "FP16" + ], + "description": "Model to run:", + "index": 0, + "layout": "IPY_MODEL_b2994422e6b34af9bff3d3c0cf1f161d", + "style": "IPY_MODEL_a4b7e815e1ab4f808a22dc551e8a6b36" + } + }, + "0feae37e49714d3caf29da081719dc69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "107ea5d1ee5e4720b57eee0b12c341f0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "1f7ebe01ae0f49b3aa26fefe1d2c9630": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "375501ad8742471ca56c5ffcac5ee9f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "4910e7d802ec4b2683801e7ac17ca700": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_5778c5124e5641cd88242830477a7ab3", + "max": 7, + "style": "IPY_MODEL_bf691f1d350149d69dff8cf5f08dd9e8", + "value": 7 + } + }, + "4af32a190c7a4896a06743fe05c7b56b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "CPU", + "GPU.0", + "GPU.1", + "AUTO" + ], + "description": "Device:", + "index": 0, + "layout": "IPY_MODEL_aa16bf7c3b94482295f85dec38155ee1", + "style": "IPY_MODEL_4fdefc6d2c57438380fa16eb0645ab3a" + } + }, + "4fdefc6d2c57438380fa16eb0645ab3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "54390732209e40b18d54807210fd46ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "5778c5124e5641cd88242830477a7ab3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "5ab2c71dcf87422cbde965d6f0661ef0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_1f7ebe01ae0f49b3aa26fefe1d2c9630", + "style": "IPY_MODEL_b469729a7d3140b0b035aaa8e7733186", + "value": " 7/7 [00:08<00:00, 1.13s/it]" + } + }, + "64ad62f21060421aabbd8456c48697d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "7723267ce24448fcbe4658b6e20cd404": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "tiny-llama-1b-chat", + "red-pajama-3b-chat", + "llama-2-chat-7b", + "mpt-7b-chat", + "qwen-7b-chat", + "chatglm2-6b", + "mistal-7b", + "zephyr-7b-beta", + "neural-chat-7b-v3-1", + "notus-7b-v1", + "youri-7b-chat" + ], + "description": "Model:", + "index": 5, + "layout": "IPY_MODEL_375501ad8742471ca56c5ffcac5ee9f8", + "style": "IPY_MODEL_f62810db78ab4b4ba44f59e98baf7333" + } + }, + "86350f1ebcba4f22a502ef4e6419fb69": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_107ea5d1ee5e4720b57eee0b12c341f0", + "outputs": [ + { + "data": { + "text/html": "
Searching for Mixed-Precision Configuration ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 112/112 • 0:04:36 • 0:00:00\n
\n", + "text/plain": "Searching for Mixed-Precision Configuration \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m112/112\u001b[0m • \u001b[38;2;0;104;181m0:04:36\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "9ad05ac3909b405582d93594f549fd94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT4 model", + "disabled": false, + "layout": "IPY_MODEL_54390732209e40b18d54807210fd46ce", + "style": "IPY_MODEL_0feae37e49714d3caf29da081719dc69", + "value": true + } + }, + "9df482a97a8c48048c1b14d023c192be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_fe514aa9268c40d79cbcce1266bc47c9", + "style": "IPY_MODEL_e19a4ef87232458081076551a2f030b2", + "value": "Loading checkpoint shards: 100%" + } + }, + "a39a1b814f3d4fd5b535216f429ae000": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "a4b7e815e1ab4f808a22dc551e8a6b36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "aa16bf7c3b94482295f85dec38155ee1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "b2994422e6b34af9bff3d3c0cf1f161d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "b469729a7d3140b0b035aaa8e7733186": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "bba76ebbf6d741fcbc92b3a210464899": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_9df482a97a8c48048c1b14d023c192be", + "IPY_MODEL_4910e7d802ec4b2683801e7ac17ca700", + "IPY_MODEL_5ab2c71dcf87422cbde965d6f0661ef0" + ], + "layout": "IPY_MODEL_eeb37297a8f94696bd754a96f0057b12" + } + }, + "bc30bb99906245b18e4c752439fe8f03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "bf691f1d350149d69dff8cf5f08dd9e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "dda49cfd9dd04af69ba91d10f8e4e175": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "e19a4ef87232458081076551a2f030b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "eeb37297a8f94696bd754a96f0057b12": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "f1d5bbc922ce40d6885cb8ee7f6f9e50": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "f486dd77a1a04c8295a25d2df0cd3f3f": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_dda49cfd9dd04af69ba91d10f8e4e175", + "outputs": [ + { + "data": { + "text/html": "
Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 114/114 • 0:06:09 • 0:00:00\n
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m114/114\u001b[0m • \u001b[38;2;0;104;181m0:06:09\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "f62810db78ab4b4ba44f59e98baf7333": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "fe514aa9268c40d79cbcce1266bc47c9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "ff880044976144c1b775423eaa85fcbb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Prepare INT8 model", + "disabled": false, + "layout": "IPY_MODEL_f1d5bbc922ce40d6885cb8ee7f6f9e50", + "style": "IPY_MODEL_bc30bb99906245b18e4c752439fe8f03", + "value": false + } + } + }, "version_major": 2, "version_minor": 0 } diff --git a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb index ac960aa4303..4caa0e05abb 100644 --- a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb @@ -428,8 +428,7 @@ " \"chatglm2-6b\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n", " \"group_size\": 128,\n", - " \"ratio\": 0.72,\n", - " \"ignored_scope\": nncf.IgnoredScope([\"__module.transformer/aten::index_67/Gather\"])\n", + " \"ratio\": 0.72\n", " },\n", " \"qwen-7b-chat\": {\n", " \"mode\": nncf.CompressWeightsMode.INT4_SYM, \n", diff --git a/notebooks/254-llm-chatbot/converter.py b/notebooks/254-llm-chatbot/converter.py index c54ada629f7..87210da4399 100644 --- a/notebooks/254-llm-chatbot/converter.py +++ b/notebooks/254-llm-chatbot/converter.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Tuple, Optional import types +from transformers.modeling_outputs import BaseModelOutputWithPast def flattenize_inputs(inputs): @@ -172,6 +173,41 @@ def convert_qwen(pt_model: torch.nn.Module, model_path: Path): del pt_model +@torch.jit.script_if_tracing +def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): + mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) + if query_layer.shape[2] == key_layer.shape[2]: + tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=mask) + return context_layer + + +def _core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None: + context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) + else: + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attention_mask + ) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + 
+@torch.jit.script_if_tracing +def _get_chatglm_attention_mask(input_ids, past_key): + mask = torch.zeros((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=past_key.dtype) + if past_key.shape[0] == 0: + tmp_mask = torch.ones((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + return mask + + def _chatglm_transformer_forward( self, input_ids, @@ -245,6 +281,11 @@ def _chatglm_transformer_forward( def _patch_chatglm_forward(model: "PreTrainedModel"): model.transformer.forward = types.MethodType(_chatglm_transformer_forward, model.transformer) + for block in model.transformer.encoder.layers: + block.self_attention.core_attention.forward = types.MethodType( + _core_attention_forward, block.self_attention.core_attention + ) + def convert_chatglm2(pt_model: torch.nn.Module, model_path: Path): """