
Commit ca581d8

Merge branch 'site' into accelerators_table

2 parents 52c86d0 + 3fd3b6b

8 files changed: +388 −70

.github/workflows/update-quick-start-module.yml (+4 −22)

```diff
@@ -32,12 +32,6 @@ jobs:
       package-type: all
       os: windows
       channel: "nightly"
-  macos-nightly-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
-    with:
-      package-type: all
-      os: macos
-      channel: "nightly"
   macos-arm64-nightly-matrix:
     uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
     with:
@@ -58,13 +52,6 @@ jobs:
       package-type: all
       os: windows
       channel: "release"
-  macos-release-matrix:
-    needs: [macos-nightly-matrix]
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
-    with:
-      package-type: all
-      os: macos
-      channel: "release"
   macos-arm64-release-matrix:
     needs: [macos-arm64-nightly-matrix]
     uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
@@ -74,9 +61,8 @@
       channel: "release"
 
   update-quick-start:
-    needs: [linux-nightly-matrix, windows-nightly-matrix, macos-nightly-matrix,
-      macos-arm64-nightly-matrix, linux-release-matrix, windows-release-matrix,
-      macos-release-matrix, macos-arm64-release-matrix]
+    needs: [linux-nightly-matrix, windows-nightly-matrix, macos-arm64-nightly-matrix,
+      linux-release-matrix, windows-release-matrix, macos-arm64-release-matrix]
     runs-on: "ubuntu-20.04"
     environment: pytorchbot-env
     steps:
@@ -92,22 +78,18 @@ jobs:
         env:
           LINUX_NIGHTLY_MATRIX: ${{ needs.linux-nightly-matrix.outputs.matrix }}
           WINDOWS_NIGHTLY_MATRIX: ${{ needs.windows-nightly-matrix.outputs.matrix }}
-          MACOS_NIGHTLY_MATRIX: ${{ needs.macos-nightly-matrix.outputs.matrix }}
-          MACOS_ARM64_NIGHTLY_MATRIX: ${{ needs.macos-arm64-nightly-matrix.outputs.matrix }}
+          MACOS_NIGHTLY_MATRIX: ${{ needs.macos-arm64-nightly-matrix.outputs.matrix }}
           LINUX_RELEASE_MATRIX: ${{ needs.linux-release-matrix.outputs.matrix }}
           WINDOWS_RELEASE_MATRIX: ${{ needs.windows-release-matrix.outputs.matrix }}
-          MACOS_RELEASE_MATRIX: ${{ needs.macos-release-matrix.outputs.matrix }}
-          MACOS_ARM64_RELEASE_MATRIX: ${{ needs.macos-arm64-release-matrix.outputs.matrix }}
+          MACOS_RELEASE_MATRIX: ${{ needs.macos-arm64-release-matrix.outputs.matrix }}
         run: |
           set -ex
           printf '%s\n' "$LINUX_NIGHTLY_MATRIX" > linux_nightly_matrix.json
           printf '%s\n' "$WINDOWS_NIGHTLY_MATRIX" > windows_nightly_matrix.json
           printf '%s\n' "$MACOS_NIGHTLY_MATRIX" > macos_nightly_matrix.json
-          printf '%s\n' "$MACOS_ARM64_NIGHTLY_MATRIX" > macos_arm64_nightly_matrix.json
           printf '%s\n' "$LINUX_RELEASE_MATRIX" > linux_release_matrix.json
           printf '%s\n' "$WINDOWS_RELEASE_MATRIX" > windows_release_matrix.json
           printf '%s\n' "$MACOS_RELEASE_MATRIX" > macos_release_matrix.json
-          printf '%s\n' "$MACOS_ARM64_RELEASE_MATRIX" > macos_arm64_release_matrix.json
           python3 ./scripts/gen_quick_start_module.py --autogenerate > assets/quick-start-module.js
           rm *_matrix.json
     - name: Create Issue if failed
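The update step serializes each job-matrix JSON blob from an environment variable into a `*_matrix.json` file, then hands the files to `gen_quick_start_module.py`. A minimal stdlib sketch of that load-and-inspect pattern (the matrix structure shown is hypothetical; the real one is produced by `generate_binary_build_matrix.yml` in pytorch/test-infra):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

# Hypothetical shape of one generated build matrix.
linux_nightly = {"include": [
    {"package_type": "pip", "gpu_arch_type": "cuda", "gpu_arch_version": "12.1"},
    {"package_type": "pip", "gpu_arch_type": "cpu", "gpu_arch_version": ""},
]}

with TemporaryDirectory() as tmp:
    # Mirror the workflow step: write the matrix to <os>_<channel>_matrix.json.
    path = Path(tmp) / "linux_nightly_matrix.json"
    path.write_text(json.dumps(linux_nightly))

    # A generator script would then load every *_matrix.json and emit the
    # quick-start JS; here we just load one back and inspect it.
    matrix = json.loads(path.read_text())
    configs = matrix["include"]
    print(len(configs), configs[0]["gpu_arch_type"])  # → 2 cuda
```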

_get_started/previous-versions.md (+43)

````diff
@@ -17,6 +17,49 @@ your convenience.
 
 ## Commands for Versions >= 1.0.0
 
+### v2.2.2
+
+#### Conda
+
+##### OSX
+
+```
+# conda
+conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 -c pytorch
+```
+
+##### Linux and Windows
+
+```
+# CUDA 11.8
+conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 pytorch-cuda=11.8 -c pytorch -c nvidia
+# CUDA 12.1
+conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 pytorch-cuda=12.1 -c pytorch -c nvidia
+# CPU Only
+conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 cpuonly -c pytorch
+```
+
+#### Wheel
+
+##### OSX
+
+```
+pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2
+```
+
+##### Linux and Windows
+
+```
+# ROCM 5.7 (Linux only)
+pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.7
+# CUDA 11.8
+pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118
+# CUDA 12.1
+pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
+# CPU only
+pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu
+```
+
 ### v2.2.1
 
 #### Conda
````

_includes/quick_start_local.html (+1 −1)

```diff
@@ -5,7 +5,7 @@
   <a href="{{ site.baseurl }}/get-started/previous-versions">install previous versions of PyTorch</a>. Note that LibTorch is only available for C++.
 </p>
 
-<p><b>NOTE:</b> Latest PyTorch requires Python 3.8 or later. For more details, see Python section below.</p>
+<p><b>NOTE:</b> Latest PyTorch requires Python 3.8 or later.</p>
 
 <div class="row">
   <div class="col-md-3 headings">
```

_posts/2024-04-24-pytorch2-3.md (+106, new file)
---
layout: blog_detail
title: "PyTorch 2.3 Release Blog"
---

We are excited to announce the release of PyTorch® 2.3 ([release note](https://github.com/pytorch/pytorch/releases/tag/v2.3.0))! PyTorch 2.3 offers support for user-defined Triton kernels in torch.compile, allowing users to migrate their own Triton kernels from eager mode without experiencing performance regressions or graph breaks. Tensor Parallelism improves the experience of training Large Language Models using native PyTorch functions, and has been validated on training runs for 100B-parameter models. In addition, semi-structured sparsity is now implemented as a Tensor subclass, with observed speedups of up to 1.6x over dense matrix multiplication.

This release is composed of 3393 commits and 426 contributors since PyTorch 2.2. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try these out and report any issues as we improve 2.3. More information about how to get started with the PyTorch 2-series can be found at our [Getting Started](https://pytorch.org/get-started/pytorch-2.0/) page.


<table class="table table-bordered">
  <tr>
    <td><strong>Beta</strong></td>
    <td><strong>Prototype</strong></td>
    <td><strong>Performance Improvements</strong></td>
  </tr>
  <tr>
    <td>User-defined Triton kernels in torch.compile</td>
    <td>torch.export adds new API to specify dynamic_shapes</td>
    <td>Weight-Only-Quantization introduced into Inductor CPU backend</td>
  </tr>
  <tr>
    <td>Tensor parallelism within PyTorch Distributed</td>
    <td>Asynchronous checkpoint generation</td>
    <td></td>
  </tr>
  <tr>
    <td>Support for semi-structured sparsity</td>
    <td></td>
    <td></td>
  </tr>
</table>


*To see a full list of public feature submissions click [here](https://docs.google.com/spreadsheets/d/1TzGkWuUMF1yTe88adz1dt2mzbIsZLd3PBasy588VWgk/edit?usp=sharing).


## Beta Features


### [Beta] Support for User-defined Triton kernels in _torch.compile_

PyTorch code that contains Triton kernels can now be executed natively using torch.compile. This enables users to migrate code containing Triton kernels from eager PyTorch to _torch.compile_ without running into performance regressions or graph breaks. Native support also creates an opportunity for TorchInductor to precompile the user-defined Triton kernel, as well as to better organize code around the Triton kernel, allowing for further optimizations.

You can find more information about how to utilize user-defined Triton kernels in torch.compile in [this tutorial](https://pytorch.org/tutorials/recipes/torch_compile_user_defined_triton_kernel_tutorial.html).


### [Beta] Tensor Parallelism introduces more efficient ways to train LLMs

The Tensor Parallel API facilitates various tensor manipulations across GPUs/hosts and integrates with FSDP for 2D parallelism (tensor parallelism across devices + data parallelism across hosts). It also offers a low-level API for constructing higher-level tensor-parallel APIs. This API has been validated to support the training of transformer models with over 100 billion parameters.

You can find more information on how to utilize this within your workflows in [this tutorial](https://pytorch.org/tutorials/intermediate/TP_tutorial.html).
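The core idea behind one common tensor-parallel layout (column-parallel linear) is to shard a weight matrix's output features across workers and concatenate the partial results. A toy sketch in plain Python, with lists standing in for tensors and shards; this illustrates the sharding math only, not the PyTorch Tensor Parallel API:

```python
def matvec(w, x):
    """Dense matrix-vector product; w is a list of rows."""
    return [sum(wi * xi for wi, xi in zip(row, x)) for row in w]

def column_parallel_matvec(w, x, num_shards):
    """Shard the output features (rows of w) across num_shards workers;
    each worker computes its slice and the slices are concatenated.
    Assumes len(w) divides evenly among the shards."""
    shard_size = len(w) // num_shards
    out = []
    for s in range(num_shards):
        shard_rows = w[s * shard_size:(s + 1) * shard_size]
        out.extend(matvec(shard_rows, x))  # this shard's partial output
    return out

w = [[1, 0], [0, 1], [2, 2], [3, -1]]
x = [5, 7]
# The sharded computation matches the unsharded one exactly.
assert column_parallel_matvec(w, x, 2) == matvec(w, x)
print(matvec(w, x))  # → [5, 7, 24, 8]
```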


### [Beta] Semi-structured sparsity provides users with a way to take advantage of accelerated sparse inference and memory savings

_torch.sparse.SparseSemiStructuredTensor_ implements semi-structured sparsity as a Tensor subclass, which has shown speedups of up to 1.6x over dense matrix multiplication.

In particular, this release adds:

* Additional support for quantization composability (mixed dtype, dequant fusion)
* Updated cuSPARSELt and CUTLASS kernels
* torch.compile support

You can find more information on how to take advantage of semi-structured sparsity [here](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html).
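Semi-structured sparsity is often called 2:4 sparsity: in every contiguous group of 4 weights, at most 2 are nonzero. A toy pruning sketch in plain Python that produces this pattern by keeping the 2 largest magnitudes per group; it illustrates the sparsity pattern only, not the accelerated cuSPARSELt/CUTLASS kernels:

```python
def prune_2_4(row):
    """Apply 2:4 semi-structured pruning: in every group of 4
    consecutive weights, keep the 2 largest magnitudes, zero the rest."""
    out = []
    for i in range(0, len(row), 4):
        group = row[i:i + 4]
        keep = sorted(range(len(group)),
                      key=lambda j: abs(group[j]), reverse=True)[:2]
        out.extend(v if j in keep else 0.0 for j, v in enumerate(group))
    return out

pruned = prune_2_4([0.9, -0.1, 0.5, 0.05, -2.0, 0.3, 0.2, 1.1])
print(pruned)  # → [0.9, 0.0, 0.5, 0.0, -2.0, 0.0, 0.0, 1.1]
```

Because exactly 2 of every 4 values survive, only half the values (plus a small index mask) need to be stored, which is where the memory savings and the specialized-kernel speedups come from.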


## Prototype Features


### [PROTOTYPE] _torch.export_ adds new API to specify _dynamic_shapes_

You can now use _torch.export.Dim_ to better represent dynamic shapes by enabling developers to specify ranges (min and max values) that can be reused across different input dimensions that are constrained to be equal.

To learn more about _torch.export.Dim_, as well as how it can be used to express more interesting relationships (such as linear arithmetic expressions), check out the tutorial [here](https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html#constraints-dynamic-shapes).


### [PROTOTYPE] Asynchronous checkpoint generation

Asynchronous checkpoint generation allows users to continue their training loops while checkpoints are being generated, essentially offloading much of the checkpointing cost.

You can find out how to utilize this within your own workflows with this [example](https://github.com/pytorch/pytorch/blob/release/2.3/torch/distributed/checkpoint/examples/async_checkpointing_example.py).
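The underlying idea can be sketched with the standard library: snapshot the state synchronously on the training thread (cheap), then hand the slow write to a background thread so the training loop keeps running. This is an illustration only; the real implementation lives in torch.distributed.checkpoint, as shown in the linked example:

```python
import json
import threading
from pathlib import Path
from tempfile import TemporaryDirectory

def async_checkpoint(state, path):
    """Copy the state on the caller's thread, then write it out on a
    background thread so training can continue immediately."""
    snapshot = dict(state)  # must copy before training mutates state
    t = threading.Thread(
        target=lambda: Path(path).write_text(json.dumps(snapshot)))
    t.start()
    return t  # join() before overwriting the same checkpoint path

with TemporaryDirectory() as tmp:
    state = {"step": 100, "lr": 0.01}
    t = async_checkpoint(state, f"{tmp}/ckpt.json")
    state["step"] = 101  # training continues while the write is in flight
    t.join()
    saved = json.loads(Path(f"{tmp}/ckpt.json").read_text())
    print(saved["step"])  # → 100: the checkpoint saw the pre-mutation snapshot
```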


## Performance Improvements


### [PROTOTYPE] Weight-Only-Quantization introduced into Inductor CPU backend

PyTorch 2.3 enhances LLM inference performance on the torch inductor CPU backend. The project [gpt-fast](https://github.com/pytorch-labs/gpt-fast) offers a simple and efficient PyTorch-native acceleration for transformer text generation with _torch.compile_. Prior to 2.3, only CUDA devices were supported; this release enables the CPU counterpart by providing highly optimized kernels for int4 and int8 weight-only quantized Linear layers.

For more information on how to utilize this feature, please refer to the [gpt-fast README](https://github.com/pytorch-labs/gpt-fast#quantization).
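Weight-only quantization stores the Linear weights in a low-precision integer format with a scale factor and dequantizes on the fly inside the matmul, while activations stay in floating point. A toy symmetric per-row int8 sketch in plain Python (illustrates the arithmetic only, not the optimized Inductor kernels):

```python
def quantize_row_int8(row):
    """Symmetric per-row int8 quantization: weight ≈ q * scale."""
    scale = max(abs(v) for v in row) / 127 or 1.0  # avoid scale == 0
    return [round(v / scale) for v in row], scale

def woq_matvec(w_rows, x):
    """Weight-only-quantized matvec: int8 weights, float activations."""
    out = []
    for row in w_rows:
        q, scale = quantize_row_int8(row)
        # Dequantize on the fly inside the dot product.
        out.append(sum(qi * scale * xi for qi, xi in zip(q, x)))
    return out

w = [[0.5, -1.0], [2.0, 0.25]]
x = [1.0, 2.0]
print(woq_matvec(w, x))  # close to the exact result [-1.5, 2.5]
```

The quantized weights take a quarter of the memory of float32, at the cost of a small rounding error; the int4 variant halves the storage again with a coarser grid.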
