|
984 | 984 | " encoder_calibration_data = []\n",
|
985 | 985 | " decoder_calibration_data = []\n",
|
986 | 986 | " ov_model.encoder.request = InferRequestWrapper(ov_model.encoder.request, encoder_calibration_data, apply_caching=True)\n",
|
987 |
| - " ov_model.decoder_with_past.request = InferRequestWrapper(ov_model.decoder_with_past.request,\n", |
| 987 | + " ov_model.decoder.request = InferRequestWrapper(ov_model.decoder.request,\n", |
988 | 988 | " decoder_calibration_data,\n",
|
989 | 989 | " apply_caching=True)\n",
|
990 | 990 | "\n",
|
|
996 | 996 | " ov_model.generate(input_features)\n",
|
997 | 997 | " finally:\n",
|
998 | 998 | " ov_model.encoder.request = ov_model.encoder.request.request\n",
|
999 |
| - " ov_model.decoder_with_past.request = ov_model.decoder_with_past.request.request\n", |
| 999 | + " ov_model.decoder.request = ov_model.decoder.request.request\n", |
1000 | 1000 | "\n",
|
1001 | 1001 | " return encoder_calibration_data, decoder_calibration_data"
|
1002 | 1002 | ]
|
|
1146 | 1146 | " gc.collect()\n",
|
1147 | 1147 | "\n",
|
1148 | 1148 | " print(\"Quantizing decoder with past\")\n",
|
1149 |
| - " quantized_decoder_with_past = nncf.quantize(\n", |
1150 |
| - " ov_model.decoder_with_past.model,\n", |
| 1149 | + " quantized_decoder = nncf.quantize(\n", |
| 1150 | + " ov_model.decoder.model,\n", |
1151 | 1151 | " nncf.Dataset(decoder_calibration_data),\n",
|
1152 | 1152 | " subset_size=len(decoder_calibration_data),\n",
|
1153 | 1153 | " model_type=nncf.ModelType.TRANSFORMER,\n",
|
1154 | 1154 | " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
|
1155 | 1155 | " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)\n",
|
1156 | 1156 | " )\n",
|
1157 |
| - " ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n", |
1158 |
| - " del quantized_decoder_with_past\n", |
| 1157 | + "    ov.save_model(quantized_decoder, quantized_model_path / \"openvino_decoder_model.xml\")\n", |
| 1158 | + " del quantized_decoder\n", |
1159 | 1159 | " del decoder_calibration_data\n",
|
1160 | 1160 | " gc.collect()\n",
|
1161 | 1161 | "\n",
|
1162 | 1162 | " # Copy the config file and the first-step-decoder manually\n",
|
1163 | 1163 | " shutil.copy(model_path / \"config.json\", quantized_model_path / \"config.json\")\n",
|
1164 |
| - " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n", |
1165 |
| - " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n", |
1166 | 1164 | "\n",
|
1167 | 1165 | " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False)\n",
|
1168 | 1166 | " quantized_ov_model.to(device.value)\n",
|
|
1392 | 1390 | " whole_infer_times = []\n",
|
1393 | 1391 | " time_fn(ov_model, \"generate\", whole_infer_times)\n",
|
1394 | 1392 | " time_fn(ov_model.encoder, \"forward\", encoder_infer_times)\n",
|
1395 |
| - " time_fn(ov_model.decoder_with_past, \"forward\", decoder_with_past_infer_times)\n", |
| 1393 | + " time_fn(ov_model.decoder, \"forward\", decoder_with_past_infer_times)\n", |
1396 | 1394 | "\n",
|
1397 | 1395 | " ground_truths = []\n",
|
1398 | 1396 | " predictions = []\n",
|
|
0 commit comments