From 864aedb4f2f6f7a51eac0240f794fa35b7a70c65 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 10:24:22 -0700
Subject: [PATCH 01/32] [DO NOT MERGE] 2.7 RC Test

---
 .jenkins/build.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 4a869d35a7..06d7cc38e6 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -22,11 +22,12 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-# sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
+pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
 # sudo pip uninstall -y fbgemm-gpu torchrec
 # sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 
+
 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm
 python -m spacy download de_core_news_sm

From 490e3b1e61b6b74f634a23b22b3df81238a11070 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 11:24:42 -0700
Subject: [PATCH 02/32] Update .jenkins/build.sh

---
 .jenkins/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 06d7cc38e6..3d6d5bbc24 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -23,7 +23,7 @@ sudo apt-get install -y pandoc
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
 # sudo pip uninstall -y fbgemm-gpu torchrec
 # sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 

From a7fcb92343413755e6324e1953c8c7c2777458d6 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 13:11:59 -0700
Subject: [PATCH 03/32] Update .jenkins/build.sh

---
 .jenkins/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 3d6d5bbc24..c66a4dc392 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -23,7 +23,7 @@ sudo apt-get install -y pandoc
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu118
 # sudo pip uninstall -y fbgemm-gpu torchrec
 # sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 

From 1daf4091eade4fc81f9a4fb6e607de13befd6c49 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 14:06:22 -0700
Subject: [PATCH 04/32] Update build.sh

---
 .jenkins/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index c66a4dc392..cd296cc245 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -23,7 +23,7 @@ sudo apt-get install -y pandoc
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu118
+pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 # sudo pip uninstall -y fbgemm-gpu torchrec
 # sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 

From 6e218dc9746e039cfda1d17f3918d9d0d284bdd8 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 15:09:53 -0700
Subject: [PATCH 05/32] Update build.sh

---
 .jenkins/build.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index cd296cc245..e36135edab 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -23,9 +23,9 @@ sudo apt-get install -y pandoc
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-# sudo pip uninstall -y fbgemm-gpu torchrec
-# sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+pip3 install torch==2.7.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
+#sudo pip uninstall -y fbgemm-gpu torchrec
+#sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 
 
 # Install two language tokenizers for Translation with TorchText tutorial

From 635975687df20b443ed407a4d7365dd06e05cb53 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 18 Mar 2025 15:51:21 -0700
Subject: [PATCH 06/32] Update build.sh

---
 .jenkins/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index e36135edab..477fd1d639 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -23,7 +23,7 @@ sudo apt-get install -y pandoc
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
-pip3 install torch==2.7.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
+pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 #sudo pip uninstall -y fbgemm-gpu torchrec
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From edd72401fb2dc5ca366f933d001279e7038846dd Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Wed, 19 Mar 2025 12:57:22 -0700
Subject: [PATCH 07/32] Update onnxscript in requirements (#3300)

---
 .ci/docker/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 89dd788ae7..f969963988 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -36,7 +36,7 @@ datasets
 transformers
 torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
 onnx
-onnxscript
+onnxscript>=0.2.2
 onnxruntime
 evaluate
 accelerate>=0.20.1

From 9a649ea62384380dccaf74c6c75595e160698624 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Fri, 21 Mar 2025 12:17:16 -0700
Subject: [PATCH 08/32] Update build.sh

---
 .jenkins/build.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 477fd1d639..069c21217f 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -22,8 +22,11 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
+sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
+pip install git+https://github.com/pytorch/tensordict
+pip install git+https://github.com/pytorch/torchrl
+
 #sudo pip uninstall -y fbgemm-gpu torchrec
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From 83a87811a75b89e9497bbec630d4effa677926d5 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Fri, 21 Mar 2025 12:17:41 -0700
Subject: [PATCH 09/32] Update .jenkins/validate_tutorials_built.py

---
 .jenkins/validate_tutorials_built.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
index 984632156e..82d127ce27 100644
--- a/.jenkins/validate_tutorials_built.py
+++ b/.jenkins/validate_tutorials_built.py
@@ -50,8 +50,12 @@
     "intermediate_source/flask_rest_api_tutorial",
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
+<<<<<<< HEAD
     "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
     "recipes_source/recipes/reasoning_about_shapes"
+=======
+    "advanced_source/semi_structured_sparse" # reenable after 3303 is fixed.
+>>>>>>> 4b04c9b (Update .jenkins/validate_tutorials_built.py)
 ]
 
 def tutorial_source_dirs() -> List[Path]:

From cfb27190924c0a4a61c6c66af6065b7e0cc04d01 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Fri, 21 Mar 2025 14:16:22 -0700
Subject: [PATCH 10/32] Update build.sh

---
 .jenkins/build.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 069c21217f..49e0668640 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,9 +24,8 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-pip install git+https://github.com/pytorch/tensordict
-pip install git+https://github.com/pytorch/torchrl
-
+pip install git+https://github.com/pytorch/tensordict.git@main --depth=1
+pip install git+https://github.com/pytorch/torchrl.git@main --depth=1
 #sudo pip uninstall -y fbgemm-gpu torchrec
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From 29f4c5648130c48b268180e64d037db71f3c5865 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Fri, 21 Mar 2025 17:10:36 -0700
Subject: [PATCH 11/32] Update .jenkins/build.sh

---
 .jenkins/build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 49e0668640..70f841af8a 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,8 +24,8 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-pip install git+https://github.com/pytorch/tensordict.git@main --depth=1
-pip install git+https://github.com/pytorch/torchrl.git@main --depth=1
+pip install git+https://github.com/pytorch/tensordict
+pip install git+https://github.com/pytorch/torchrl
 #sudo pip uninstall -y fbgemm-gpu torchrec
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From 4eb24e15f03892184d8656a907bfd3b02beff17c Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 09:51:43 -0700
Subject: [PATCH 12/32] Update build.sh

---
 .jenkins/build.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 70f841af8a..7208376475 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,8 +24,6 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-pip install git+https://github.com/pytorch/tensordict
-pip install git+https://github.com/pytorch/torchrl
 #sudo pip uninstall -y fbgemm-gpu torchrec
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From 45f2bd536ca2950a0d6773b9f6704939d17a194a Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 10:01:57 -0700
Subject: [PATCH 13/32] Apply suggestions from code review

---
 .jenkins/build.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 7208376475..af795d3dad 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,7 +24,10 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-#sudo pip uninstall -y fbgemm-gpu torchrec
+sudo pip uninstall -y fbgemm-gpu torchrec
+sudo pip3 install -y https://download.pytorch.org/whl/nightly/fbgemm-gpu/
+sudo pip3 install -y https://download.pytorch.org/whl/nightly/torchrec/
+
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 
 

From c309e11f7c234aa63fbd4a093a6976b883cb7fa4 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 11:05:13 -0700
Subject: [PATCH 14/32] Update build.sh

---
 .jenkins/build.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index af795d3dad..e40dd82ecc 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -25,9 +25,7 @@ sudo apt-get install -y pandoc
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 sudo pip uninstall -y fbgemm-gpu torchrec
-sudo pip3 install -y https://download.pytorch.org/whl/nightly/fbgemm-gpu/
-sudo pip3 install -y https://download.pytorch.org/whl/nightly/torchrec/
-
+sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 
 

From 3674238eb33da0b798fd9715f2a7ac4f3caab857 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 13:34:28 -0700
Subject: [PATCH 15/32] Update requirements.txt

---
 .ci/docker/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index f969963988..0e95c62c6b 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -69,5 +69,5 @@ pycocotools
 semilearn==0.3.2
 torchao==0.5.0
 segment_anything==1.0
-torchrec==1.0.0; platform_system == "Linux"
+torchrec==1.1.0; platform_system == "Linux"
 fbgemm-gpu==1.1.0; platform_system == "Linux"

From d40f8550fd5095a96c21d8ca882f21ca3e52335d Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 13:35:12 -0700
Subject: [PATCH 16/32] Update .jenkins/build.sh

---
 .jenkins/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index e40dd82ecc..f8916478ca 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,7 +24,7 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-sudo pip uninstall -y fbgemm-gpu torchrec
+sudo pip uninstall -y fbgemm-gpu
 sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 

From dbfe3da38a807acd9fdc81b0a0ad7e1c217e7372 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Mon, 24 Mar 2025 14:36:37 -0700
Subject: [PATCH 17/32] Update .jenkins/build.sh

---
 .jenkins/build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index f8916478ca..ccd3683779 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -26,6 +26,8 @@ sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl t
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 sudo pip uninstall -y fbgemm-gpu
 sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
+pip install tensordict-nightly
+pip install torchrl-nightly
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 
 

From 3885455aebedc2995196360362ce2a5c1f52e749 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@meta.com>
Date: Tue, 25 Mar 2025 17:35:34 -0400
Subject: [PATCH 18/32] Fix the AOTI example (#3306)

Summary: The compiled model run takes the same input as Eager. No need to explicitly compose args as a tuple.
---
 intermediate_source/torch_export_tutorial.py | 2 +-
 recipes_source/torch_export_aoti_python.py   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/intermediate_source/torch_export_tutorial.py b/intermediate_source/torch_export_tutorial.py
index 3ca6d09a52..20b1b4023e 100644
--- a/intermediate_source/torch_export_tutorial.py
+++ b/intermediate_source/torch_export_tutorial.py
@@ -995,7 +995,7 @@ def forward(self, x):
 #    with torch.no_grad():
 #        pt2_path = torch._inductor.aoti_compile_and_package(ep)
 #
-#    # Load and run the .so file in Python.
+#    # Load and run the .pt2 file in Python.
 #    # To load and run it in a C++ environment, see:
 #    # https://pytorch.org/docs/main/torch.compiler_aot_inductor.html
 #    aoti_compiled = torch._inductor.aoti_load_package(pt2_path)
diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py
index c0cbb7e280..ff311f071e 100644
--- a/recipes_source/torch_export_aoti_python.py
+++ b/recipes_source/torch_export_aoti_python.py
@@ -176,7 +176,7 @@
 model_path = os.path.join(os.getcwd(), "resnet18.pt2")
 
 compiled_model = torch._inductor.aoti_load_package(model_path)
-example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
+example_inputs = torch.randn(2, 3, 224, 224, device=device)
 
 with torch.inference_mode():
     output = compiled_model(example_inputs)
@@ -238,11 +238,11 @@ def timed(fn):
 
 torch._dynamo.reset()
 
-model = torch._inductor.aoti_load_package(model_path)
-example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
+compiled_model = torch._inductor.aoti_load_package(model_path)
+example_inputs = torch.randn(1, 3, 224, 224, device=device)
 
 with torch.inference_mode():
-    _, time_taken = timed(lambda: model(example_inputs))
+    _, time_taken = timed(lambda: compiled_model(example_inputs))
     print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms")
 
 

From d7d29fe36b63cd03f159fdaea03bbe3f8cace7b2 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 25 Mar 2025 19:36:05 -0700
Subject: [PATCH 19/32] Update build.sh

---
 .jenkins/build.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index ccd3683779..7705a429cd 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -24,10 +24,10 @@ sudo apt-get install -y pandoc
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
 sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
 pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-sudo pip uninstall -y fbgemm-gpu
-sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
-pip install tensordict-nightly
-pip install torchrl-nightly
+#sudo pip uninstall -y fbgemm-gpu
+#sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
+#pip install tensordict-nightly
+#pip install torchrl-nightly
 #sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
 
 

From f2fcf6f4b4815e828ffe0504c78ca99a60ca9720 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 25 Mar 2025 19:45:54 -0700
Subject: [PATCH 20/32] Disable rl tutorials again

---
 .jenkins/validate_tutorials_built.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py
index 82d127ce27..984632156e 100644
--- a/.jenkins/validate_tutorials_built.py
+++ b/.jenkins/validate_tutorials_built.py
@@ -50,12 +50,8 @@
     "intermediate_source/flask_rest_api_tutorial",
     "intermediate_source/text_to_speech_with_torchaudio",
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
-<<<<<<< HEAD
     "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
     "recipes_source/recipes/reasoning_about_shapes"
-=======
-    "advanced_source/semi_structured_sparse" # reenable after 3303 is fixed.
->>>>>>> 4b04c9b (Update .jenkins/validate_tutorials_built.py)
 ]
 
 def tutorial_source_dirs() -> List[Path]:

From b87d98dc75f53da321208e4713087498edb0ac8d Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Tue, 8 Apr 2025 11:07:47 -0700
Subject: [PATCH 21/32] Add Context Parallel tutorial

---
 prototype_source/context_parallel.rst | 208 ++++++++++++++++++++++++++
 prototype_source/prototype_index.rst  |   9 ++
 2 files changed, 217 insertions(+)
 create mode 100644 prototype_source/context_parallel.rst

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
new file mode 100644
index 0000000000..0a65c68719
--- /dev/null
+++ b/prototype_source/context_parallel.rst
@@ -0,0 +1,208 @@
+Introduction to Context Parallel
+======================================
+**Authors**: `Xilun Wu <https://github.com/XilunWu>`_, `Chien-Chin Huang <https://github.com/fegin>`__
+
+.. note::
+    |edit| View and edit this tutorial in `github <https://github.com/pytorch/tutorials/blob/main/intermediate_source/context_parallel.rst>`__.
+
+.. grid:: 2
+
+   .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
+      :class-card: card-prerequisites
+
+      * `Context Parallel APIs <https://pytorch.org/docs/stable/distributed.tensor.html#torch.distributed.tensor.experimental.context_parallel>`__
+      * `1M sequence training in torchtitan with Context Parallel <https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082>`__
+
+
+   .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+      :class-card: card-prerequisites
+
+      * PyTorch 2.7 or later
+
+
+Introduction
+------------
+
+Context Parallel is an approach used in large language model (LLM) training to reduce peak activation size by sharding the long input sequence across multiple devices.
+It breaks the constraint on input sequence length resulting from peak memory usage on storing activations in Transformer blocks.
+
+The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
+Ring Attention shuffles the KV shards and calculates the partial attention scores,
+repeating until all KV shards have been used on each device.
+We implemented two Ring Attention variants: `pass-KV <https://arxiv.org/abs/2411.01783>`__ and `all-to-all <https://openreview.net/forum?id=WsRHpHH4s0>`__.
+The pass-KV approach all-gathers KV shards while performing the local SDPA (Scaled Dot Product Attention) then performs the rest when the communication completes.
+The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA computation and the all-to-all communication
+necessary for the next SDPA.
+
+The Context Parallel APIs consist of two parts:
+
+1. ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
+   will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
+   argument ``buffers`` and ``buffer_seq_dims`` respectively.
+2. ``set_rotate_method()`` allows users to choose between the pass-KV approach and the all-to-all approach.
+
+
+Setup
+---------------------
+
+With ``torch.distributed.tensor.experimental.context_parallel()``, users can easily shard the Tensor input and parallelize the execution of the SDPA function.
+To better demonstrate the usage of this API, we start with a simple code snippet doing SDPA and then parallelize it using the API:
+
+.. code:: python
+
+    import torch
+    import torch.nn.functional as F
+
+    from torch.nn.attention import sdpa_kernel, SDPBackend
+
+
+    def sdpa_example():
+        assert torch.cuda.is_available()
+        torch.cuda.set_device("cuda:0")
+        torch.cuda.manual_seed(0)
+
+        batch = 8
+        nheads = 8
+        qkv_len = 8192
+        dim = 32
+        backend = SDPBackend.FLASH_ATTENTION
+        dtype = (
+            torch.bfloat16
+            if backend == SDPBackend.FLASH_ATTENTION
+            or backend == SDPBackend.CUDNN_ATTENTION
+            else torch.float32
+        )
+
+        qkv = [
+            torch.rand(
+                (batch, nheads, qkv_len, dim),
+                dtype=dtype,
+                requires_grad=True,
+                device='cuda',
+            )
+            for _ in range(3)
+        ]
+
+        with sdpa_kernel(backend):
+            out = F.scaled_dot_product_attention(*qkv, is_causal=True)
+
+
+    if __name__ == "__main__":
+        sdpa_example()
+
+
+Enable Context Parallel
+-----------------------
+
+Now, let's first adapt it to a distributed program where each rank has the same tensor input. Then we apply the context parallel API to
+shard the input and distribute the computation across ranks:
+
+.. code:: python
+
+    # file: cp_sdpa_example.py
+    import os
+
+    import torch
+    import torch.distributed as dist
+    import torch.nn.functional as F
+    from torch.distributed.device_mesh import init_device_mesh
+    from torch.distributed.tensor.experimental import context_parallel
+    from torch.distributed.tensor.experimental._attention import context_parallel_unshard
+    from torch.nn.attention import sdpa_kernel, SDPBackend
+
+
+    def context_parallel_sdpa_example(world_size: int, rank: int):
+        assert torch.cuda.is_available()
+        assert dist.is_nccl_available()
+        torch.cuda.set_device(f"cuda:{rank}")
+        torch.cuda.manual_seed(0)
+
+        dist.init_process_group(
+            backend="nccl",
+            init_method="env://",
+            world_size=world_size,
+            rank=rank,
+        )
+        device_mesh = init_device_mesh(
+            device_type="cuda", mesh_shape=(world_size,), mesh_dim_names=("cp",)
+        )
+
+        batch = 8
+        nheads = 8
+        qkv_len = 64
+        dim = 32
+        backend = SDPBackend.FLASH_ATTENTION
+        dtype = (
+            torch.bfloat16
+            if backend == SDPBackend.FLASH_ATTENTION
+            or backend == SDPBackend.CUDNN_ATTENTION
+            else torch.float32
+        )
+
+        qkv = [
+            torch.rand(
+                (batch, nheads, qkv_len, dim),
+                dtype=dtype,
+                requires_grad=True,
+                device='cuda',
+            )
+            for _ in range(3)
+        ]
+        cp_qkv = [t.detach().clone() for t in qkv]
+
+        with sdpa_kernel(backend):
+            with context_parallel(
+                device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2)
+            ):
+                cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)
+
+            (cp_out,) = context_parallel_unshard(device_mesh, [cp_out], [2])
+            out = F.scaled_dot_product_attention(*qkv, is_causal=True)
+
+            assert torch.allclose(
+                cp_out,
+                out,
+                atol=(1e-08 if dtype == torch.float32 else 1e-03 * world_size),
+            )
+
+
+    if __name__ == "__main__":
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+
+        try:
+            context_parallel_sdpa_example(world_size, rank)
+        finally:
+            dist.barrier()
+            dist.destroy_process_group()
+
+
+You can use the command ``torchrun --standalone --nnodes=1 --nproc-per-node=4 cp_sdpa_example.py`` to launch the above context parallel
+SDPA on 4 GPUs. We demonstrate the numeric correctness by comparing the output of Ring Attention to that of SDPA on a single GPU.
+
+
+Select Rotation Approach
+------------------------
+
+You can choose the desired shards rotation approach in Ring Attention by using ``torch.distributed.tensor.experimental._attention.set_rotate_method()``:
+
+.. code:: python
+
+    # file: cp_sdpa_example.py
+    from torch.distributed.tensor.experimental._attention import set_rotate_method
+
+    set_rotate_method("alltoall")  # rotate shards using all-to-all
+
+    with sdpa_kernel(backend):
+        with context_parallel(
+            device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2)
+        ):
+            cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)
+
+
+Conclusion
+----------
+
+In this tutorial, we have learned how to parallelize the SDPA computation along the sequence dimension easily with our Context Parallel APIs. For
+design and implementation details, performance analysis, and an end-to-end training example in `torchtitan <https://github.com/pytorch/torchtitan>`__,
+see our post on `PyTorch native long-context training <https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082>`__.
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index 927f5f694b..f429d71f32 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -239,6 +239,14 @@ Prototype features are not available as part of binary distributions like PyPI o
    :link: ../prototype/flight_recorder_tutorial.html
    :tags: Distributed, Debugging, FlightRecorder
 
+.. Distributed
+.. customcarditem::
+   :header: Context Parallel Tutorial
+   :card_description: Parallelize the attention computation along sequence dimension
+   :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: ../prototype/context_parallel.html
+   :tags: Distributed, Context Parallel
+
 .. Integration
 .. customcarditem::
    :header: Out-of-tree extension autoloading in Python
@@ -265,6 +273,7 @@ Prototype features are not available as part of binary distributions like PyPI o
 .. toctree::
    :hidden:
 
+   prototype/context_parallel.html
    prototype/fx_graph_mode_quant_guide.html
    prototype/fx_graph_mode_ptq_dynamic.html
    prototype/fx_graph_mode_ptq_static.html

From f75c9fdcc43048ff8a0dcd292cd75f2f8a03c80e Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Wed, 9 Apr 2025 16:32:08 -0700
Subject: [PATCH 22/32] fix typo

---
 prototype_source/context_parallel.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index 0a65c68719..fc50ee070e 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -3,7 +3,7 @@ Introduction to Context Parallel
 **Authors**: `Xilun Wu <https://github.com/XilunWu>`_, `Chien-Chin Huang <https://github.com/fegin>`__
 
 .. note::
-    |edit| View and edit this tutorial in `github <https://github.com/pytorch/tutorials/blob/main/intermediate_source/context_parallel.rst>`__.
+    |edit| View and edit this tutorial in `github <https://github.com/pytorch/tutorials/blob/main/prototype_source/context_parallel.rst>`__.
 
 .. grid:: 2
 

From dcf02def8e2cb3d5de3218dfa74f65580e56ae21 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:52:55 -0700
Subject: [PATCH 23/32] fix: address comment

---
 prototype_source/context_parallel.rst | 29 +++++++++++++++++++--------
 prototype_source/prototype_index.rst  |  1 -
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index fc50ee070e..a0e894e86b 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -23,7 +23,7 @@ Introduction to Context Parallel
 Introduction
 ------------
 
-Context Parallel is an approach used in LLM to reduce peak activation size by sharding the long input sequence across multiple devices.
+Context Parallel is an approach used in large language model training to reduce peak activation size by sharding the long input sequence across multiple devices.
 It breaks the constraint on input sequence length resulting from peak memory usage on storing activations in Transformer blocks.
 
 The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
@@ -82,7 +82,7 @@ To better demonstrate the usage of this API, we start with a simple code snippet
             )
             for _ in range(3)
         ]
-
+        # specify the SDPABackend to use
         with sdpa_kernel(backend):
             out = F.scaled_dot_product_attention(*qkv, is_causal=True)
 
@@ -148,22 +148,35 @@ shard to input and distribute the computation across ranks:
             )
             for _ in range(3)
         ]
+        # specify the SDPABackend to use
+        with sdpa_kernel(backend):
+            out = F.scaled_dot_product_attention(*qkv, is_causal=True)
+
+        # make a clean copy of QKV for output comparison
         cp_qkv = [t.detach().clone() for t in qkv]
 
         with sdpa_kernel(backend):
+            # This `context_parallel()` performs two actions:
+            # 1. shard the tensor objects in `buffers` in-place along the dimension
+            #    specified in `buffer_seq_dims`, the tensors in `buffers` and their
+            #    sharding dims in `buffer_seq_dims` are organized in the same order.
+            # 2. replace the execution of `F.scaled_dot_product_attention` with a
+            #    context-paralleled-enabled Ring Attention.
             with context_parallel(
                 device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2)
             ):
                 cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)
 
+            # the output `cp_out` is still sharded in the same way as QKV
+            # the `context_parallel_unshard` API allows users to easily
+            # unshard to gain the full tensor.
             (cp_out,) = context_parallel_unshard(device_mesh, [cp_out], [2])
-            out = F.scaled_dot_product_attention(*qkv, is_causal=True)
 
-            assert torch.allclose(
-                cp_out,
-                out,
-                atol=(1e-08 if dtype == torch.float32 else 1e-03 * world_size),
-            )
+        assert torch.allclose(
+            cp_out,
+            out,
+            atol=(1e-08 if dtype == torch.float32 else 1e-03 * world_size),
+        )
 
 
     if __name__ == "__main__":
diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst
index f429d71f32..80a517bef2 100644
--- a/prototype_source/prototype_index.rst
+++ b/prototype_source/prototype_index.rst
@@ -239,7 +239,6 @@ Prototype features are not available as part of binary distributions like PyPI o
    :link: ../prototype/flight_recorder_tutorial.html
    :tags: Distributed, Debugging, FlightRecorder
 
-.. Distributed
 .. customcarditem::
    :header: Context Parallel Tutorial
    :card_description: Parallelize the attention computation along sequence dimension

From 4275c42b0f3febaac61c737af2007e345efae2a8 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Thu, 10 Apr 2025 16:26:04 -0700
Subject: [PATCH 24/32] fix: typos

---
 prototype_source/context_parallel.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index a0e894e86b..9a6b62f2d0 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -82,7 +82,7 @@ To better demonstrate the usage of this API, we start with a simple code snippet
             )
             for _ in range(3)
         ]
-        # specify the SDPABackend to use
+        # specify the SDPBackend to use
         with sdpa_kernel(backend):
             out = F.scaled_dot_product_attention(*qkv, is_causal=True)
 
@@ -148,7 +148,7 @@ shard to input and distribute the computation across ranks:
             )
             for _ in range(3)
         ]
-        # specify the SDPABackend to use
+        # specify the SDPBackend to use
         with sdpa_kernel(backend):
             out = F.scaled_dot_product_attention(*qkv, is_causal=True)
 
@@ -191,7 +191,7 @@ shard to input and distribute the computation across ranks:
 
 
 You can use the command ``torchrun --standalone --nnodes=1 --nproc-per-node=4 cp_sdpa_example.py`` to launch the above context parallel
-SDPA on 4 GPUs. We demonstrate the nemuric correctness by comparing the output of Ring Attention to that of SDPA on a single GPU.
+SDPA on 4 GPUs. We demonstrate the numeric correctness by comparing the output of Ring Attention to that of SDPA on a single GPU.
 
 
 Select Rotation Approach

From c6d8dfae8eaf942ab938f1da388129b50d861975 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Mon, 14 Apr 2025 10:56:44 -0700
Subject: [PATCH 25/32] address review comments

---
 prototype_source/context_parallel.rst | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index 9a6b62f2d0..a5c28859af 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -3,7 +3,7 @@ Introduction to Context Parallel
 **Authors**: `Xilun Wu <https://github.com/XilunWu>`_, `Chien-Chin Huang <https://github.com/fegin>`__
 
 .. note::
-    |edit| View and edit this tutorial in `github <https://github.com/pytorch/tutorials/blob/main/prototype_source/context_parallel.rst>`__.
+    |edit| View and edit this tutorial in `GitHub <https://github.com/pytorch/tutorials/blob/main/prototype_source/context_parallel.rst>`__.
 
 .. grid:: 2
 
@@ -11,7 +11,7 @@ Introduction to Context Parallel
       :class-card: card-prerequisites
 
       * `Context Parallel APIs <https://pytorch.org/docs/stable/distributed.tensor.html#torch.distributed.tensor.experimental.context_parallel>`__
-      * `1M sequence training in torchtitan with Context Parallel <https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082>`__
+      * `1M sequence training in TorchTitan with Context Parallel <https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082>`__
 
 
    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
@@ -29,7 +29,7 @@ It breaks the constraint on input sequence length resulting from peak memory usa
 The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
 Ring Attention shuffles the KV shards and calculates the partial attention scores,
 repeats until all KV shards have been used on each device.
-We implemented two Ring Attention variants: `pass-KV <https://arxiv.org/abs/2411.01783>`__ and `all-to-all <https://openreview.net/forum?id=WsRHpHH4s0>`__.
+Two Ring Attention variants have been implemented: `pass-KV <https://arxiv.org/abs/2411.01783>`__ and `all-to-all <https://openreview.net/forum?id=WsRHpHH4s0>`__.
 The pass-KV approach all-gathers KV shards while performing the local SDPA (Scaled Dot Product Attention) then performs the rest when the communication completes.
 The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA computation and the all-to-all communication
 necessary for the next SDPA.
@@ -37,8 +37,8 @@ necessary for the next SDPA.
 The Context Parallel APIs consist of two parts:
 
 1. ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
-will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
-argument ``buffers`` and ``buffer_seq_dims`` respectively.
+   will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
+   argument ``buffers`` and ``buffer_seq_dims`` respectively.
 2. ``set_rotate_method()`` allows users to choose between the pass-KV approach and the all-to-all approach.
 
 
@@ -157,17 +157,17 @@ shard to input and distribute the computation across ranks:
 
         with sdpa_kernel(backend):
             # This `context_parallel()` performs two actions:
-            # 1. shard the tensor objects in `buffers` in-place along the dimension
+            # 1. Shard the tensor objects in `buffers` in-place along the dimension
             #    specified in `buffer_seq_dims`, the tensors in `buffers` and their
             #    sharding dims in `buffer_seq_dims` are organized in the same order.
-            # 2. replace the execution of `F.scaled_dot_product_attention` with a
+            # 2. Replace the execution of `F.scaled_dot_product_attention` with a
             #    context-paralleled-enabled Ring Attention.
             with context_parallel(
                 device_mesh, buffers=tuple(cp_qkv), buffer_seq_dims=(2, 2, 2)
             ):
                 cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)
 
-            # the output `cp_out` is still sharded in the same way as QKV
+            # The output `cp_out` is still sharded in the same way as QKV;
             # the `context_parallel_unshard` API allows users to easily
             # unshard to gain the full tensor.
             (cp_out,) = context_parallel_unshard(device_mesh, [cp_out], [2])
@@ -216,6 +216,6 @@ You can choose the desired shards rotation approach in Ring Attention by using `
 Conclusion
 ----------
 
-In this tutorial, have learned how to parallelize the SDPA computation along the sequence dimension easily with our Context Parallel APIs. For
-design and implementation details, performance analysis, and an end-to-end training example in `torchtitan <https://github.com/pytorch/torchtitan>`__,
+In this tutorial, we have learned how to parallelize the SDPA computation along the sequence dimension easily with our Context Parallel APIs. For
+design and implementation details, performance analysis, and an end-to-end training example in `TorchTitan <https://github.com/pytorch/torchtitan>`__,
 see our post on `PyTorch native long-context training <https://discuss.pytorch.org/t/distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel/215082>`__.

From a6938ff7652d0c0c6ee4e511066c8b00c56f1bdb Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Mon, 14 Apr 2025 14:18:36 -0700
Subject: [PATCH 26/32] address comments: improve pass-KV description

---
 prototype_source/context_parallel.rst | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index a5c28859af..947ca036da 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -27,19 +27,21 @@ Context Parallel is an approach used in large language model training to reduce
 It breaks the constraint on input sequence length resulting from peak memory usage on storing activations in Transformer blocks.
 
 The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
-Ring Attention shuffles the KV shards and calculates the partial attention scores,
-repeats until all KV shards have been used on each device.
-Two Ring Attention variants have been implemented: `pass-KV <https://arxiv.org/abs/2411.01783>`__ and `all-to-all <https://openreview.net/forum?id=WsRHpHH4s0>`__.
-The pass-KV approach all-gathers KV shards while performing the local SDPA (Scaled Dot Product Attention) then performs the rest when the communication completes.
-The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA computation and the all-to-all communication
-necessary for the next SDPA.
+Ring Attention shuffles the KV shards and calculates the partial attention scores, repeats until all KV shards have been used on each device.
+Two Ring Attention variants have been implemented: `the all-gather based pass-KV <https://arxiv.org/abs/2407.21783>`__ and `the all-to-all based pass-KV <https://openreview.net/forum?id=WsRHpHH4s0>`__:
+1.  The all-gather based pass-KV algorithm is used in Llama3 training, which initially performs an all-gather on the key and value tensors, followed by computing the attention output for the
+    local query tensor chunk. Our modified all-gather based pass-KV algorithm concurrently all-gathers KV shards and computes attention output for the local query tensor chunk
+    using local key and value tensor chunks, followed by a final computation of attention output for the local query tensor and remaining KV shards. This allows some degree of
+    overlap between the attention computation and the all-gather collective.
+2.  The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA computation and the all-to-all communication
+    necessary for the next SDPA.
 
 The Context Parallel APIs consist of two parts:
 
 1. ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
    will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
    argument ``buffers`` and ``buffer_seq_dims`` respectively.
-2. ``set_rotate_method()`` allows users to choose between the pass-KV approach and the all-to-all approach.
+2. ``set_rotate_method()`` allows users to choose between the all-gather based pass-KV approach and the all-to-all based pass-KV approach.
 
 
 Setup
@@ -213,6 +215,9 @@ You can choose the desired shards rotation approach in Ring Attention by using `
             cp_out = F.scaled_dot_product_attention(*cp_qkv, is_causal=True)
 
 
+The default rotation approach is the all-gather based pass-KV.
+
+
 Conclusion
 ----------
 

From b74e6cc20ddc6efe846b29f0e69a0c545e7b3b82 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Mon, 14 Apr 2025 14:34:39 -0700
Subject: [PATCH 27/32] address comments: improve API description

---
 prototype_source/context_parallel.rst | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index 947ca036da..767a983f20 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -33,15 +33,16 @@ Two Ring Attention variants have been implemented: `the all-gather based pass-KV
     local query tensor chunk. Our modified all-gather based pass-KV algorithm concurrently all-gathers KV shards and computes attention output for the local query tensor chunk
     using local key and value tensor chunks, followed by a final computation of attention output for the local query tensor and remaining KV shards. This allows some degree of
     overlap between the attention computation and the all-gather collective.
-2.  The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA computation and the all-to-all communication
+2.  The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA (Scaled Dot Product Attention) computation and the all-to-all communication
     necessary for the next SDPA.
 
 The Context Parallel APIs consist of two parts:
 
-1. ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
-   will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
-   argument ``buffers`` and ``buffer_seq_dims`` respectively.
-2. ``set_rotate_method()`` allows users to choose between the all-gather based pass-KV approach and the all-to-all based pass-KV approach.
+1.  ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
+    will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
+    argument ``buffers`` and ``buffer_seq_dims`` respectively. We recommend that users add tensors computing along the sequence dimension to ``buffers``
+    and shard them along this dimension.
+2.  ``set_rotate_method()`` allows users to choose between the all-gather based pass-KV approach and the all-to-all based pass-KV approach.
 
 
 Setup

From 02d419c1910df43a78296668ae3b678e8d9f9f40 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Mon, 14 Apr 2025 14:36:58 -0700
Subject: [PATCH 28/32] address comments: improve API description

---
 prototype_source/context_parallel.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index 767a983f20..3c86b8120d 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -32,7 +32,7 @@ Two Ring Attention variants have been implemented: `the all-gather based pass-KV
 1.  The all-gather based pass-KV algorithm is used in Llama3 training, which initially performs an all-gather on the key and value tensors, followed by computing the attention output for the
     local query tensor chunk. Our modified all-gather based pass-KV algorithm concurrently all-gathers KV shards and computes attention output for the local query tensor chunk
     using local key and value tensor chunks, followed by a final computation of attention output for the local query tensor and remaining KV shards. This allows some degree of
-    overlap between the attention computation and the all-gather collective.
+    overlap between the attention computation and the all-gather collective. Note that in Llama3 training, ``freqs_cis`` is also sharded over the sequence dimension.
 2.  The all-to-all approach uses interleaved all-to-all collectives to ring shuffle KV shards to overlap the SDPA (Scaled Dot Product Attention) computation and the all-to-all communication
     necessary for the next SDPA.
 

From 80f228c46b287befdbf12741ea38293c4c027e18 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Mon, 14 Apr 2025 16:56:39 -0700
Subject: [PATCH 29/32] fix indentation

---
 prototype_source/context_parallel.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index 3c86b8120d..ed874917c5 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -29,6 +29,7 @@ It breaks the constraint on input sequence length resulting from peak memory usa
 The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
 Ring Attention shuffles the KV shards and calculates the partial attention scores, repeats until all KV shards have been used on each device.
 Two Ring Attention variants have been implemented: `the all-gather based pass-KV <https://arxiv.org/abs/2407.21783>`__ and `the all-to-all based pass-KV <https://openreview.net/forum?id=WsRHpHH4s0>`__:
+
 1.  The all-gather based pass-KV algorithm is used in Llama3 training, which initially performs an all-gather on the key and value tensors, followed by computing the attention output for the
     local query tensor chunk. Our modified all-gather based pass-KV algorithm concurrently all-gathers KV shards and computes attention output for the local query tensor chunk
     using local key and value tensor chunks, followed by a final computation of attention output for the local query tensor and remaining KV shards. This allows some degree of

From 0432a238069d1b04f3fe3edbc08187a7d73bf1b2 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Tue, 15 Apr 2025 13:24:26 -0700
Subject: [PATCH 30/32] address review comments

---
 prototype_source/context_parallel.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prototype_source/context_parallel.rst b/prototype_source/context_parallel.rst
index ed874917c5..46f2f2e864 100644
--- a/prototype_source/context_parallel.rst
+++ b/prototype_source/context_parallel.rst
@@ -26,7 +26,7 @@ Introduction
 Context Parallel is an approach used in large language model training to reduce peak activation size by sharding the long input sequence across multiple devices.
 It breaks the constraint on input sequence length resulting from peak memory usage on storing activations in Transformer blocks.
 
-The core of Context Parallel is Ring Attention, a novel parallel implementation of the Attention layer.
+Ring Attention, a novel parallel implementation of the Attention layer, is critical to performant Context Parallel.
 Ring Attention shuffles the KV shards and calculates the partial attention scores, repeats until all KV shards have been used on each device.
 Two Ring Attention variants have been implemented: `the all-gather based pass-KV <https://arxiv.org/abs/2407.21783>`__ and `the all-to-all based pass-KV <https://openreview.net/forum?id=WsRHpHH4s0>`__:
 
@@ -42,7 +42,7 @@ The Context Parallel APIs consist of two parts:
 1.  ``context_parallel()`` allows users to create a Python context where the SDPA function (``torch.nn.functional.scaled_dot_product_attention``)
     will be automatically replaced with Ring Attention. To shard Tensors along a dimension, simply pass the Tensors and their sharding dimensions to
     argument ``buffers`` and ``buffer_seq_dims`` respectively. We recommend that users add tensors computing along the sequence dimension to ``buffers``
-    and shard them along this dimension.
+    and shard them along this dimension. For example, in Llama3 training, omitting ``freqs_cis`` from ``buffers`` results in a miscalculated rotary embedding.
 2.  ``set_rotate_method()`` allows users to choose between the all-gather based pass-KV approach and the all-to-all based pass-KV approach.
 
 

From ada3e08ead70e9863c0ba2a3d79c68aa4c5d0f62 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:15:25 -0700
Subject: [PATCH 31/32] manually fix rebase issues

---
 .ci/docker/requirements.txt                  |  4 ++--
 .jenkins/build.sh                            | 12 ++++--------
 intermediate_source/torch_export_tutorial.py |  2 +-
 recipes_source/torch_export_aoti_python.py   |  6 +++---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 0e95c62c6b..89dd788ae7 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -36,7 +36,7 @@ datasets
 transformers
 torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
 onnx
-onnxscript>=0.2.2
+onnxscript
 onnxruntime
 evaluate
 accelerate>=0.20.1
@@ -69,5 +69,5 @@ pycocotools
 semilearn==0.3.2
 torchao==0.5.0
 segment_anything==1.0
-torchrec==1.1.0; platform_system == "Linux"
+torchrec==1.0.0; platform_system == "Linux"
 fbgemm-gpu==1.1.0; platform_system == "Linux"
diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 7705a429cd..4a869d35a7 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -22,14 +22,10 @@ sudo apt-get install -y pandoc
 #Install PyTorch Nightly for test.
 # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
 # Install 2.5 to merge all 2.4 PRs - uncomment to install nightly binaries (update the version as needed).
-sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata torchrl tensordict
-pip3 install torch==2.7.0 torchvision torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-#sudo pip uninstall -y fbgemm-gpu
-#sudo pip3 install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu126/
-#pip install tensordict-nightly
-#pip install torchrl-nightly
-#sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu126
-
+# sudo pip uninstall -y torch torchvision torchaudio torchtext torchdata
+# sudo pip3 install torch==2.6.0 torchvision --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
+# sudo pip uninstall -y fbgemm-gpu torchrec
+# sudo pip3 install fbgemm-gpu==1.1.0 torchrec==1.0.0 --no-cache-dir --index-url https://download.pytorch.org/whl/test/cu124
 
 # Install two language tokenizers for Translation with TorchText tutorial
 python -m spacy download en_core_web_sm
diff --git a/intermediate_source/torch_export_tutorial.py b/intermediate_source/torch_export_tutorial.py
index 20b1b4023e..3ca6d09a52 100644
--- a/intermediate_source/torch_export_tutorial.py
+++ b/intermediate_source/torch_export_tutorial.py
@@ -995,7 +995,7 @@ def forward(self, x):
 #    with torch.no_grad():
 #        pt2_path = torch._inductor.aoti_compile_and_package(ep)
 #
-#    # Load and run the .pt2 file in Python.
+#    # Load and run the .so file in Python.
 #    # To load and run it in a C++ environment, see:
 #    # https://pytorch.org/docs/main/torch.compiler_aot_inductor.html
 #    aoti_compiled = torch._inductor.aoti_load_package(pt2_path)
diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py
index ff311f071e..b8d556093a 100644
--- a/recipes_source/torch_export_aoti_python.py
+++ b/recipes_source/torch_export_aoti_python.py
@@ -238,11 +238,11 @@ def timed(fn):
 
 torch._dynamo.reset()
 
-compiled_model = torch._inductor.aoti_load_package(model_path)
-example_inputs = torch.randn(1, 3, 224, 224, device=device)
+model = torch._inductor.aoti_load_package(model_path)
+example_inputs = (torch.randn(1, 3, 224, 224, device=device),)
 
 with torch.inference_mode():
-    _, time_taken = timed(lambda: compiled_model(example_inputs))
+    _, time_taken = timed(lambda: model(example_inputs))
     print(f"Time taken for first inference for AOTInductor is {time_taken:.2f} ms")
 
 

From 5872433cd167fa477692b3c5a502a3124e7c55ab Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:16:31 -0700
Subject: [PATCH 32/32] manually fix rebase issues

---
 recipes_source/torch_export_aoti_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/torch_export_aoti_python.py b/recipes_source/torch_export_aoti_python.py
index b8d556093a..c0cbb7e280 100644
--- a/recipes_source/torch_export_aoti_python.py
+++ b/recipes_source/torch_export_aoti_python.py
@@ -176,7 +176,7 @@
 model_path = os.path.join(os.getcwd(), "resnet18.pt2")
 
 compiled_model = torch._inductor.aoti_load_package(model_path)
-example_inputs = torch.randn(2, 3, 224, 224, device=device)
+example_inputs = (torch.randn(2, 3, 224, 224, device=device),)
 
 with torch.inference_mode():
     output = compiled_model(example_inputs)