diff --git a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/IntelPyTorch_TrainingOptimizations_AMX_BF16.ipynb b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/IntelPyTorch_TrainingOptimizations_AMX_BF16.ipynb
index ae9fdb76a5..70962b2f27 100644
--- a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/IntelPyTorch_TrainingOptimizations_AMX_BF16.ipynb
+++ b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/IntelPyTorch_TrainingOptimizations_AMX_BF16.ipynb
@@ -118,7 +118,7 @@
    "## Training the Model\n",
    "The function trainModel() will train the ResNet50 model based on whether Intel® AMX should be enabled and whether to use the FP32 or BF16 data type. The environment variable `ONEDNN_MAX_CPU_ISA` is used to enable or disable Intel® AMX. **Note that this environment variable is only initialized once.** This means that running with Intel® AMX and VNNI requires separate processes. The best practice is to set this environment variable before running your script. For more information, refer to the [oneDNN documentation on CPU Dispatcher Control](https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/performance-profiling-and-inspection/cpu-dispatcher-control.html).\n",
    "\n",
-   "To use BF16 in operations, use the `torch.cpu.amp.autocast()` function to perform forward and backward propagation."
+   "To use BF16 in operations, use the `torch.amp.autocast('cpu')` function to perform forward and backward propagation."
   ]
  },
  {
@@ -128,7 +128,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "os.environ[\"ONEDNN_MAX_CPU_ISA\"] = \"AVX512_CORE_BF16\""
+   "os.environ[\"ONEDNN_MAX_CPU_ISA\"] = \"AVX512_CORE_AMX\""
   ]
  },
  {
@@ -171,7 +171,7 @@
    "    for batch_idx, (data, target) in enumerate(train_loader):\n",
    "        optimizer.zero_grad()\n",
    "        if \"bf16\" == dataType:\n",
-   "            with torch.cpu.amp.autocast(): # Auto Mixed Precision\n",
+   "            with torch.amp.autocast('cpu'): # Auto Mixed Precision\n",
    "                # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.\n",
    "                data = data.to(memory_format=torch.channels_last)\n",
    "                output = model(data)\n",
@@ -240,8 +240,8 @@
    "## Training with FP32 and BF16, including Intel® AMX\n",
    "Train the ResNet50 model in three different cases:\n",
    "1. FP32 (baseline) \n",
-   "2. BF16 without Intel® AMX \n",
-   "3. BF16 with Intel® AMX \n",
+   "2. BF16 with Intel® AMX\n",
+   "3. BF16 without Intel® AMX\n",
    "\n",
    "The training time is recorded."
   ]
  },
@@ -260,12 +260,12 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "75aafe25-4f7d-42ad-92ed-3438bd78c00b",
+  "id": "3faaf5de",
   "metadata": {},
   "outputs": [],
   "source": [
-   "print(\"Training model with BF16 with AVX512\")\n",
-   "!python pytorch_training_avx512_bf16.py"
+   "print(\"Training model with BF16 with Intel® AMX\")\n",
+   "bf16_amx_training_time = trainModel(train_loader, modelName=\"bf16_withAmx\", dataType=\"bf16\")"
   ]
  },
  {
@@ -275,19 +275,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "# Record the training time for BF16 using AVX512\n",
-   "bf16_avx512_training_time = None #TODO: enter in training time"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "id": "2fdc8a70-509a-4714-8524-084f34e287c3",
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "print(\"Training model with BF16 with Intel® AMX\")\n",
-   "bf16_amx_training_time = trainModel(train_loader, modelName=\"bf16_withAmx\", dataType=\"bf16\")"
+   "print(\"Training model with BF16 with AVX512\")\n",
+   "!python pytorch_training_avx512_bf16.py\n",
+   "\n",
+   "# Read the training time written by the subprocess\n",
+   "with open('bf16_avx512_training_time.txt', 'r') as f:\n",
+   "    bf16_avx512_training_time = float(f.read().strip())"
   ]
  },
  {
@@ -391,9 +384,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.8.10 64-bit (microsoft store)",
+   "display_name": "pytorch_test",
    "language": "python",
-   "name": "python3"
+   "name": "pytorch_test"
   },
   "language_info": {
    "codemirror_mode": {
@@ -405,12 +398,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "ed6ae0d06e7bec0fef5f1fb38f177ceea45508ce95c68ed2f49461dd6a888a39"
-   }
+   "version": "3.11.0"
   }
  },
 "nbformat": 4,
diff --git a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_amx_bf16.py b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_amx_bf16.py
deleted file mode 100644
index af9c7545f4..0000000000
--- a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_amx_bf16.py
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-'''
-==============================================================
- Copyright © 2022 Intel Corporation
-
- SPDX-License-Identifier: MIT
-==============================================================
-'''
-
-import os
-from time import time
-import matplotlib.pyplot as plt
-import torch
-import torchvision
-import intel_extension_for_pytorch as ipex
-
-# Hyperparameters and constants
-LR = 0.001
-MOMENTUM = 0.9
-DOWNLOAD = True
-DATA = 'datasets/cifar10/'
-
-os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_AMX"
-
-"""
-Function to run a test case
-"""
-def trainModel(train_loader, modelName="myModel", dataType="fp32"):
-    """
-    Input parameters
-        train_loader: a torch DataLoader object containing the training data
-        modelName: a string representing the name of the model
-        dataType: the data type for model parameters, supported values - fp32, bf16
-    Return value
-        training_time: the time in seconds it takes to train the model
-    """
-
-    # Initialize the model
-    model = torchvision.models.resnet50()
-    model = model.to(memory_format=torch.channels_last)
-    criterion = torch.nn.CrossEntropyLoss()
-    optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
-    model.train()
-
-    # Optimize with BF16 or FP32 (default)
-    if "bf16" == dataType:
-        model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)
-    else:
-        model, optimizer = ipex.optimize(model, optimizer=optimizer)
-
-    # Train the model
-    num_batches = len(train_loader)
-    start_time = time()
-    for batch_idx, (data, target) in enumerate(train_loader):
-        optimizer.zero_grad()
-        if "bf16" == dataType:
-            with torch.cpu.amp.autocast(): # Auto Mixed Precision
-                # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.
-                data = data.to(memory_format=torch.channels_last)
-                output = model(data)
-                loss = criterion(output, target)
-                loss.backward()
-        else:
-            # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.
-            data = data.to(memory_format=torch.channels_last)
-            output = model(data)
-            loss = criterion(output, target)
-            loss.backward()
-        optimizer.step()
-        if 0 == (batch_idx+1) % 50:
-            print("Batch %d/%d complete" %(batch_idx+1, num_batches))
-    end_time = time()
-    training_time = end_time-start_time
-    print("Training took %.3f seconds" %(training_time))
-
-    # Save a checkpoint of the trained model
-    torch.save({
-        'model_state_dict': model.state_dict(),
-        'optimizer_state_dict': optimizer.state_dict(),
-        }, 'checkpoint_%s.pth' %modelName)
-
-    return training_time
-
-"""
-Perform all types of training in main function
-"""
-def main():
-    # Check if hardware supports AMX
-    import sys
-    sys.path.append('../../')
-    import version_check
-    from cpuinfo import get_cpu_info
-    info = get_cpu_info()
-    flags = info['flags']
-    amx_supported = False
-    for flag in flags:
-        if "amx" in flag:
-            amx_supported = True
-            break
-    if not amx_supported:
-        print("AMX is not supported on current hardware. Code sample cannot be run.\n")
-        return
-
-    # Load dataset
-    transform = torchvision.transforms.Compose([
-        torchvision.transforms.Resize((224, 224)),
-        torchvision.transforms.ToTensor(),
-        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
-    ])
-    train_dataset = torchvision.datasets.CIFAR10(
-        root=DATA,
-        train=True,
-        transform=transform,
-        download=DOWNLOAD,
-    )
-    train_loader = torch.utils.data.DataLoader(
-        dataset=train_dataset,
-        batch_size=128
-    )
-
-    # Train models and acquire training times
-    print("Training model with FP32")
-    fp32_training_time = trainModel(train_loader, modelName="fp32", dataType="fp32")
-    print("Training model with BF16 with AMX")
-    bf16_withAmx_training_time = trainModel(train_loader, modelName="bf16_withAmx", dataType="bf16")
-
-    # Training time results
-    print("Summary")
-    print("FP32 training time: %.3f" %fp32_training_time)
-    print("BF16 with AMX training time: %.3f" %bf16_withAmx_training_time)
-
-    # Create bar chart with training time results
-    plt.figure()
-    plt.title("ResNet Training Time")
-    plt.xlabel("Test Case")
-    plt.ylabel("Training Time (seconds)")
-    plt.bar(["FP32", "BF16 w/AMX"], [fp32_training_time, bf16_withAmx_training_time])
-
-    # Calculate speedup when using AMX
-    speedup_from_fp32 = fp32_training_time / bf16_withAmx_training_time
-    print("BF16 with AMX is %.2fX faster than FP32" %speedup_from_fp32)
-
-    # Create bar chart with speedup results
-    plt.figure()
-    plt.title("AMX Speedup")
-    plt.xlabel("Test Case")
-    plt.ylabel("Speedup")
-    plt.bar(["FP32", "BF16 w/AMX"], [1, speedup_from_fp32])
-
-    plt.show()
-
-if __name__ == '__main__':
-    main()
-    print('[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]')
diff --git a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_avx512_bf16.py b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_avx512_bf16.py
index b28ad49db8..a0cecd5098 100644
--- a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_avx512_bf16.py
+++ b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/pytorch_training_avx512_bf16.py
@@ -56,7 +56,7 @@ def trainModel(train_loader, modelName="myModel", dataType="fp32"):
     for batch_idx, (data, target) in enumerate(train_loader):
         optimizer.zero_grad()
         if "bf16" == dataType:
-            with torch.cpu.amp.autocast(): # Auto Mixed Precision
+            with torch.amp.autocast('cpu'): # Auto Mixed Precision
                 # Setting memory_format to torch.channels_last could improve performance with 4D input data. This is optional.
                 data = data.to(memory_format=torch.channels_last)
                 output = model(data)
@@ -106,7 +106,11 @@ def main():
 
     # Train models and acquire training times
     print("Training model with BF16 with AVX512")
-    bf16_noAmx_training_time = trainModel(train_loader, modelName="bf16_noAmx", dataType="bf16")
+    bf16_avx512_training_time = trainModel(train_loader, modelName="bf16_noAmx", dataType="bf16")
+
+    # Save the training time to a file so the notebook process can read it
+    with open('bf16_avx512_training_time.txt', 'w') as f:
+        f.write(str(bf16_avx512_training_time))
 
 if __name__ == '__main__':
     main()
diff --git a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/sample.json b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/sample.json
index bebcc021cf..24f64b8422 100644
--- a/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/sample.json
+++ b/AI-and-Analytics/Features-and-Functionality/IntelPyTorch_TrainingOptimizations_AMX_BF16/sample.json
@@ -17,8 +17,6 @@
         "conda activate pytorch",
         "python -m pip install -r requirements.txt",
         "python -m ipykernel install --user --name=pytorch",
-        "python pytorch_training_avx512_bf16.py",
-        "python pytorch_training_amx_bf16.py",
         "jupyter nbconvert --ExecutePreprocessor.enabled=True --ExecutePreprocessor.kernel_name=pytorch --to notebook IntelPyTorch_TrainingOptimizations_AMX_BF16.ipynb"
       ]
     }
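
Reviewer note: the `ONEDNN_MAX_CPU_ISA` / `torch.amp.autocast('cpu')` pairing this patch migrates to can be sanity-checked outside the sample. A minimal sketch, assuming a recent PyTorch build with CPU autocast support; the linear model and synthetic batch below are illustrative placeholders, not the sample's ResNet50/CIFAR-10 pipeline:

```python
import os

# oneDNN reads ONEDNN_MAX_CPU_ISA once, on first use, so it must be set
# before the first torch operator executes; this value permits Intel AMX.
os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_AMX"

import torch

model = torch.nn.Linear(64, 10)        # placeholder model, not ResNet50
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

data = torch.randn(128, 64)            # synthetic batch
target = torch.randint(0, 10, (128,))

optimizer.zero_grad()
with torch.amp.autocast('cpu'):        # BF16 is the default autocast dtype on CPU
    output = model(data)
    loss = criterion(output, target)
loss.backward()                        # backward pass outside the autocast region
optimizer.step()
```

One detail worth noting: PyTorch's AMP documentation recommends running only the forward pass and loss computation under autocast, so `loss.backward()` sits outside the `with` block here, whereas the sample keeps it inside for the BF16 path.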
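
Similarly, the file round-trip added for `bf16_avx512_training_time` exists because `ONEDNN_MAX_CPU_ISA` is latched after first use, so the AVX512 run must live in its own process and can only hand its result back out-of-band. A self-contained sketch of that handoff, where the inline child script stands in for pytorch_training_avx512_bf16.py and the timing value is made up:

```python
import subprocess
import sys

# Child process: computes (here, fakes) a training time and writes it to a
# known file, mirroring what pytorch_training_avx512_bf16.py now does.
child_script = (
    "training_time = 12.345  # stand-in value, not a real measurement\n"
    "with open('bf16_avx512_training_time.txt', 'w') as f:\n"
    "    f.write(str(training_time))\n"
)
subprocess.run([sys.executable, "-c", child_script], check=True)

# Parent process (the notebook side): read the value back.
with open('bf16_avx512_training_time.txt', 'r') as f:
    bf16_avx512_training_time = float(f.read().strip())
print("BF16 with AVX512 training time: %.3f" % bf16_avx512_training_time)
```

A plain text file keeps the handoff dependency-free; JSON or pickle would work equally well if more than one value had to cross the process boundary.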