Changes from all commits
157 commits
7a9a082
Changed VERSION to 2.7.0.dev0 (#1973)
KshitijLakhani Jul 21, 2025
5ba7953
[PyTorch] Remove GH pinned deps (#1961)
ksivaman Jul 21, 2025
78a3821
[PyTorch] Reset FP8 weight workspace if usages are invalid (#1972)
timmoon10 Jul 21, 2025
ab5cc40
Fix the condition error when checking fp8 attn in `get_attention_back…
yuzhongw-nvidia Jul 21, 2025
0d80228
[Common] Skip cuDNN 9.10.0/9.10.1 due to bugs (#1937)
cyanguwa Jul 21, 2025
315b47d
[PyTorch] Debug linear layer when saving original input and using deb…
timmoon10 Jul 22, 2025
cb504cd
[Common] Improved performance of mxfp8 cast kernels (#1628)
Oleg-Goncharov Jul 22, 2025
e0204fb
Refactor `te.ops` (#1951)
janekb04 Jul 22, 2025
d1967d5
fix: Add stream synchronization before destroying MPI communicator (#…
djns99 Jul 22, 2025
fdb87af
[PyTorch] Reset recipe state in fusible operations when FP8 amax hist…
timmoon10 Jul 23, 2025
4296b7d
Fix the device for cuDNN/cuBLAS handles (#1974)
cyanguwa Jul 23, 2025
992ba01
[JAX] Fix current scaling test_helper.py and enable test_helper.py in…
jberchtold-nvidia Jul 23, 2025
2a29345
[JAX] Helper to disable TE custom calls + disable GemmPrimitive for n…
phu0ngng Jul 24, 2025
dab931a
[PyTorch] Improve L2Normalization basic op (#1964)
negvet Jul 24, 2025
fe27bf1
Fix runtime lib loading for cuDNN (#1989)
ksivaman Jul 24, 2025
ee84108
Add `in_place` kwarg to extra tensor ops (#1983)
janekb04 Jul 24, 2025
71b2dd4
Fix cudnn versioning support in PyTorch DPA and Fused attn (#1991)
KshitijLakhani Jul 24, 2025
a99c056
[Common] Fixed integer overflow issue in cast kernels (#1988)
Oleg-Goncharov Jul 24, 2025
25a8219
[JAX] Fixing GemmPrimitive partitioning rules to handle tensor-parall…
denera Jul 24, 2025
e950ceb
[PyTorch] Optimize cudagraph static_grad_outputs reuse (#1992)
buptzyb Jul 25, 2025
374849e
[PyTorch] Enable generic QK norm support (+ RMSNorm/LayerNorm) (#1966)
negvet Jul 25, 2025
1470116
[C][PyTorch] Remove deprecated `device_id` arg for multi tensor API (…
ksivaman Jul 25, 2025
38c26dd
Fixed double buffering issue for asymmetric layers (#1984)
sanandaraj5597 Jul 25, 2025
c6c1f50
[PyTorch] Add ops for dropout and constant scale (#1995)
timmoon10 Jul 25, 2025
aac7442
[PyTorch] Prune L0 unit test (#1999)
ksivaman Jul 29, 2025
5a495a3
Fix the use-after-free bug in unfused normalization (#2002)
ptrendx Jul 29, 2025
cb5013b
[PyTorch] Refactor C++ quantizer infrastructure (#1952)
timmoon10 Jul 29, 2025
f858dc3
Rename `do_not_clear` to `_do_not_clear` (#1977)
janekb04 Jul 29, 2025
feda5b5
Fuse amax computation into activation kernel (#2004)
janekb04 Jul 29, 2025
020428f
[PyTorch] Fix bug with clearing op outputs during backward (#2008)
timmoon10 Jul 30, 2025
11ac24c
Refactor normalization.cpp to use quantizer logic introduced in #1952…
janekb04 Jul 30, 2025
858755c
[JAX] TE GEMM checkpointing policies (#2003)
jberchtold-nvidia Jul 30, 2025
44a581c
[PyTorch Debug] Minor fix in docs. (#1947)
dupeljan Jul 31, 2025
51eb636
Fuse amax computation into normalization kernel for current scaling (…
janekb04 Jul 31, 2025
8dfdb91
[PyTorch] Tutorial for the ONNX export (#1586)
pggPL Jul 31, 2025
8e2d37e
[PyTorch] Fix corner case in router fusion (#2009)
Autumn1998 Aug 1, 2025
1258bbe
Manually launch wgrad accumulation and reduce in backward_dw() instea…
lhb8125 Aug 1, 2025
c444bf5
[PyTorch Debug] Fix debug tests (#2021)
pggPL Aug 1, 2025
13cae89
Tensor numel() return dtype to be size_t (#2022)
shangz-ai Aug 1, 2025
1f2df73
Fix JAX and PyTorch wheel builds for v2.6 (#2005)
jberchtold-nvidia Aug 1, 2025
c3f8a9f
[Core] Kernel that swaps first two tensor dimensions (#1998)
timmoon10 Aug 4, 2025
06947e8
[PyTorch] Fix cudagraph static_input and static_grad_input reuse (#2018)
buptzyb Aug 4, 2025
3e6859e
[JAX] Sharding specs for TE GEMM custom call operands (#2023)
phu0ngng Aug 5, 2025
6c97061
[JAX] Disable TE Norm Custom Calls (#1993)
phu0ngng Aug 5, 2025
7101f4b
[PyTorch] Fix zero initialization in permute kernel for padded slots …
xiaoxi-wangfj Aug 6, 2025
ed42b5a
[JAX] Remove `dot_1_output_axes` usage in LayerNormMLP (#2029)
phu0ngng Aug 6, 2025
6d178b4
[JAX] Reduce L1 tests/jax/test_distributed_softmax.py test runtime (#…
jberchtold-nvidia Aug 6, 2025
c0d2f1a
[PyTorch] Multi-tensor swizzle scaling factors for MXFP8 and fuse pad…
yaox12 Aug 6, 2025
de69ca0
[PyTorch] fix input_quantizer usage for save_original_input; fix bloc…
hxbai Aug 6, 2025
c5ee5fd
Revert "[JAX] Disable TE Norm Custom Calls" (#2035)
phu0ngng Aug 6, 2025
bfab8c6
[Common] PDL for Quantization Kernels (#2001)
yaox12 Aug 7, 2025
dd083bd
[PyTorch] Fix numeric overflow caused by int-type parameters and retu…
lvdunlin Aug 7, 2025
cae1c43
[JAX] TE Gemm custom call clean up (#2030)
phu0ngng Aug 7, 2025
9f9b481
[JAX] Remove cudaGraph compatible trait from GroupedGemmFFI and Group…
phu0ngng Aug 8, 2025
b6b3abc
[PyTorch debug] Improve precision debug tools performance (#1909)
pggPL Aug 8, 2025
235c8d0
[JAX] Enable TE GEMM custom call for all recipes (#2047)
phu0ngng Aug 8, 2025
077e26c
Use userbuffers for MXFP8 wgrad all-gather overlap (#1982)
djns99 Aug 9, 2025
de6afe2
[PyTorch] Fix high-precision dtype for MXFP8 AG (#2058)
ksivaman Aug 11, 2025
bfca2e3
[PyTorch] Update amax pointers when reallocating amax history in fusi…
timmoon10 Aug 12, 2025
f947e70
[PyTorch] Fix bug when deducing dtype in linear functional API (#2017)
timmoon10 Aug 12, 2025
6a4e871
[JAX] Support custom recipe and custom collection name when creating …
jberchtold-nvidia Aug 12, 2025
05d3b7b
[PyTorch] Fix normalization+amax forward CS fusion to work for untune…
janekb04 Aug 12, 2025
ec65ba3
[JAX] Add L2_jax_distributed_unittest (#2060)
jberchtold-nvidia Aug 13, 2025
ebca615
[Common] PDL for Blockwise Quantization (#2066)
yaox12 Aug 13, 2025
6afca29
[PyTorch Debug] More advanced stats for Quantized Tensors (#1897)
pggPL Aug 13, 2025
aa0659e
Remove if-else and torch.tensor to meet cudagraph requirement (#1997)
katec846 Aug 13, 2025
8dc2756
[JAX] Manual axis filter in `with_sharding_constraint` (#2069)
phu0ngng Aug 13, 2025
bbddcb9
[JAX] Cleanup the MLP warning for TE GEMM + TP (#2054)
phu0ngng Aug 13, 2025
44fbe9e
fix: update grad_output quant to avoid redundant work (#1736)
kshitij12345 Aug 14, 2025
c582f6b
[Common] Reduce CUDA driver calls (#2067)
yaox12 Aug 14, 2025
ccbc8cf
[PyTorch] Register weight and bias params in linear op (#2027)
timmoon10 Aug 14, 2025
26b4b71
[PyTorch] Avoid registering FP8 scale update in ops without backward …
timmoon10 Aug 14, 2025
a169e9e
[PyTorch] Disable fused dbias-quantize kernel for unsupported recipes…
timmoon10 Aug 14, 2025
12065ac
[Core] Add launch bounds to swizzle kernels (#2076)
ksivaman Aug 14, 2025
92f431b
[JAX] Trim dist fused attn tests in L1 (#2050)
KshitijLakhani Aug 15, 2025
c654e4f
Fuse linear+scale+add (#2042)
janekb04 Aug 15, 2025
6ba98d4
fix: fixes multi head attention for context parallel: rotary embeddin…
jomitchellnv Aug 16, 2025
757fd1c
[JAX] Fix Flax variable creation when quantizers are created directly…
jberchtold-nvidia Aug 18, 2025
988af0f
Update list of authorized CI users (#2078)
timmoon10 Aug 18, 2025
0e3e270
[PyTorch] Check if the given recipe is supported in `fp8_autocast` (#…
yaox12 Aug 18, 2025
3fc1e4b
[JAX] Fix for TE GEMM - Always AllGather RHS non-contracting dims wit…
phu0ngng Aug 18, 2025
734bced
Changed VERSION to 2.8.0.dev0
ptrendx Aug 18, 2025
1d075c0
Add user to TE CI (#2089)
ksivaman Aug 19, 2025
5b4d89c
Add backward RMSNorm+Add fusion (#2028)
janekb04 Aug 20, 2025
51f19fd
[PyTorch] Add test for TRT integration + fix for mxfp8 export (#2083)
pggPL Aug 20, 2025
bc99a88
[JAX] Error checking for mesh resource and update GemmPrimitive to us…
jberchtold-nvidia Aug 20, 2025
96944a8
[PyTorch] Avoid garbage collection when capturing a CUDA Graph (#2092)
timmoon10 Aug 20, 2025
406e2c9
Fix incorrect version checks for atomic GEMM (#2095)
timmoon10 Aug 20, 2025
f1b18ed
Update list of authorized CI users (#2081)
timmoon10 Aug 20, 2025
20be25a
[ TE-JAX ] Expose cp_strategy argument to DPA api (#2090)
kocchop Aug 21, 2025
40dde4d
Update NGC version to 25.08 (#2085)
phu0ngng Aug 22, 2025
d88137c
[PyTorch] Debug Mcore wgrad fusion with te.ops (#2097)
timmoon10 Aug 23, 2025
78e097f
[Jax] Fix narrowing conversions (#2094)
alexeldeib Aug 25, 2025
2e23ad7
[JAX] Add Shardy warning in GEMM custom call (#2101)
phu0ngng Aug 25, 2025
47ab4a7
[JAX] Add Transformer Layer tests for pre_scale_bias and post_scale_b…
KshitijLakhani Aug 25, 2025
ccc1abf
[Pytorch] Fix `UnboundLocalError` during build (#2116)
ksivaman Aug 26, 2025
07db17b
[PyTorch] Expose more activation functions (#2106)
yaox12 Aug 26, 2025
3d0ea80
[JAX] `ScaledTensor1x` to store `amax` (#2117)
phu0ngng Aug 26, 2025
d972e76
Revert "[Common] PDL for Quantization Kernels" (#2114)
jberchtold-nvidia Aug 26, 2025
d770886
[JAX] Add `tpsp_resource` in the `MeshResource` map (#2113)
phu0ngng Aug 26, 2025
54c0c85
Bump cuDNN FE to 1.14.0 (#2072)
vcherepanov-nv Aug 26, 2025
d370608
Revert "[Common] PDL for Blockwise Quantization" (#2115)
jberchtold-nvidia Aug 26, 2025
1398fa5
[PyTorch Debug] Skip log test on device if it does not support fp8. (…
pggPL Aug 26, 2025
8dba296
Add cuBLASMp-backed GEMM-like API to TE common (#1824)
mk-61 Aug 26, 2025
62a57dd
FP8 AllGather in FP8 GroupedGEMM + Fix Stream Usage Issue. (#2086)
mingxu1067 Aug 27, 2025
04add79
[JAX] Delay MeshResource validation until first usage (#2124)
jberchtold-nvidia Aug 27, 2025
c950800
[JAX] Decouple Recipe and ScalingMode (#1728)
jberchtold-nvidia Aug 27, 2025
a282136
[JAX] `dot_1_output` sharding constraint + use AXIS_IS_UNSHARDED (#2128)
phu0ngng Aug 27, 2025
1e2c68d
[JAX] Add amax input to DBiasQuantizePrimitive and FFI (#2118)
phu0ngng Aug 27, 2025
de81b7d
Further relax constraints to cuDNN 9.13 for disabling fused attn for …
KshitijLakhani Aug 27, 2025
c776141
Temporarily remove comm_gemm tests (#2133)
vcherepanov-nv Aug 28, 2025
a5c7987
[PyTorch] Disable determinism for sm100 (#2130)
cyanguwa Aug 28, 2025
06a38cc
[PyTorch] ONNX export of FP8 Current Scaling (#2068)
pggPL Aug 28, 2025
c449c6c
[PyTorch][MOE] Tentative Fix For Replacing from_blob with empty for e…
zhongbozhu Aug 28, 2025
f98e305
build: pull cached wheels (#2127)
ko3n1g Aug 29, 2025
715c3bb
feat: Add support for multiple quantization modes in the UB communica…
djns99 Aug 29, 2025
4285874
[Common] Add checks to CUDA kernel launch and CUDA API calls (#2074)
yaox12 Aug 29, 2025
607fcc4
[PyTorch] Support bf16+fp8 cudagraph (#2098)
buptzyb Aug 29, 2025
e0e3d12
Dropout with 8-bit RNG (#2014)
vasunvidia Aug 31, 2025
67fcc15
Create GPU reload buffers on main stream (#2131)
sanandaraj5597 Sep 2, 2025
3b4366b
Fix CI failures for UB overlap changes (#2149)
djns99 Sep 3, 2025
f378eaf
[JAX] Fix failing fused attn tests for dropout=0.1 and bias for sm100…
KshitijLakhani Sep 3, 2025
0f68f7b
[PyTorch][CUDA Graph] Fix FP8 Weight Quantization Cache under CUDA Gr…
zhongbozhu Sep 4, 2025
e9a5fa4
[PyTorch] fix cross entropy vanishing gradients (#2139)
casper-hansen Sep 4, 2025
11e9d66
Fix bug when enabling --overlap-grad-reduce in mcore (#2142)
lhb8125 Sep 5, 2025
b10f436
Fix CUDA version in setup.py (#2132)
vcherepanov-nv Sep 5, 2025
c47f329
[JAX] NoScaleTensor wrapper for non-quantized data (#2136)
jberchtold-nvidia Sep 5, 2025
5b3d65c
[JAX] Fix GroupedScaledTensor creation with keyword arg (#2154)
phu0ngng Sep 8, 2025
aa06107
Fixing few issues with multi-process launching. (#2155)
mingxu1067 Sep 8, 2025
603dbf7
Update list of authorized CI users (#2152)
timmoon10 Sep 8, 2025
84fa28d
Fused RoPE with combined QKV input. (#2122)
vasunvidia Sep 8, 2025
a26a7f1
Add bf16/fp32 token-per-expert to the MoE aux loss kernel (#2162)
Autumn1998 Sep 9, 2025
5f2b831
[JAX] Scale swizzling via JAX transpose op (#2163)
phu0ngng Sep 9, 2025
4903f94
Extract cpp distributed tests into a separate project (#2165)
vcherepanov-nv Sep 10, 2025
483d959
Adds context parallelism utilities: moving cp shards to diff ranks an…
jomitchellnv Sep 10, 2025
405d474
[PyTorch Debug] Fix issue with negative underflow% stat. (#2107)
pggPL Sep 15, 2025
cd2034f
Lower precision gated-act to accelerate FP8 current-scaling. (#2153)
mingxu1067 Sep 15, 2025
59130cc
[PyTorch] Support activation CPU offloading in fusible ops (#2158)
timmoon10 Sep 15, 2025
258d084
Do not use normalization forward + amax fusion if cuDNN backend is re…
janekb04 Sep 16, 2025
c221909
Fix unjoined comm stream in UB communicator (#2160)
djns99 Sep 16, 2025
ba37529
FP8 Output Quantization for GEMM (#2123)
vthumbe1503 Sep 17, 2025
7042d7a
TE Gemma tutorial attempt#2 (#1839)
sudhakarsingh27 Sep 17, 2025
93a67af
Fix memory overhead of linear layer when all gather from sequence par…
yuzhongw-nvidia Sep 17, 2025
eb69fad
Fix incorrect TP rank calculation when using data parallel (#2179)
djns99 Sep 17, 2025
8aee1bb
[Pytorch] Add Cutlass Grouped GEMM Support for fine-grained MoE Model…
cassiewilliam Sep 18, 2025
c334fc4
[PyTorch] Support FA3 for MLA and with CP (#1907)
zhujian19891203 Sep 18, 2025
7f77127
Fix cuDNN version checks when getting backend and for sm89 kv cache (…
KshitijLakhani Sep 18, 2025
7f4b020
Merge upstream 7f77127 v2.8.0.dev0 with TODOs
ipanfilo Dec 9, 2025
b95717e
Merge conflict resolution and restore ROCm functionality
ipanfilo Jan 13, 2026
7d2ed36
Merge dev f141f34
ipanfilo Jan 13, 2026
92ce375
Fix JAX and Pytorch UT; code cleanup; ROCm 7.2 w/a (#404)
ipanfilo Jan 13, 2026
a406914
Review comments
ipanfilo Jan 15, 2026
403527b
Merge branch 'dev' into IFU-dev-250918-v2.8
ipanfilo Jan 16, 2026
cfe3fc3
Remove unneeded intermediate var in cast kernel. Update tests. Dis…
ipanfilo Jan 17, 2026
c29c8fb
Merge branch 'dev' into IFU-dev-250918-v2.8
ipanfilo Jan 17, 2026
08db27e
Resolve automerge error
ipanfilo Jan 17, 2026
8427362
Fix benchmark script. Remove not needed debug messages
ipanfilo Jan 18, 2026
6 changes: 5 additions & 1 deletion .github/workflows/trigger-ci.yml
@@ -53,7 +53,11 @@ jobs:
|| github.actor == 'lhb8125'
|| github.actor == 'kunlunl'
|| github.actor == 'pstjohn'
|| github.actor == 'mk-61'
|| github.actor == 'vcherepanov-nv'
|| github.actor == 'tdophung'
|| github.actor == 'vthumbe1503'
|| github.actor == 'janekb04'
|| github.actor == 'shengfangd'
)
steps:
- name: Check if comment is issued by authorized person
3 changes: 3 additions & 0 deletions .gitmodules
@@ -20,3 +20,6 @@
[submodule "examples/pytorch/nanogpt"]
path = examples/pytorch/nanogpt
url = https://github.com/floraamd/nanoGPTwTE.git
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/NVIDIA/cutlass.git
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 190 files
1 change: 1 addition & 0 deletions 3rdparty/cutlass
Submodule cutlass added at 57e3cf
6 changes: 3 additions & 3 deletions README.rst
@@ -526,15 +526,15 @@ For example to use the NGC PyTorch container interactively,

.. code-block:: bash

docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:25.04-py3
docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:25.08-py3

For example to use the NGC JAX container interactively,

.. code-block:: bash

docker run --gpus all -it --rm nvcr.io/nvidia/jax:25.04-py3
docker run --gpus all -it --rm nvcr.io/nvidia/jax:25.08-py3

Where 25.04 (corresponding to April 2025 release) is the container version.
Where 25.08 (corresponding to August 2025 release) is the container version.

**Benefits of using NGC containers:**

8 changes: 4 additions & 4 deletions benchmarks/attention/benchmark_attention.py
@@ -9,11 +9,11 @@
import torch
import nvtx
import transformer_engine
from tests.pytorch.fused_attn.test_fused_attn import (
from tests.pytorch.utils import (
ModelConfig,
_get_attention_backends,
_run_dot_product_attention,
get_available_attention_backends,
)
from tests.pytorch.attention.test_attention import _run_dot_product_attention

pd.set_option("display.precision", 4)

@@ -197,7 +197,7 @@ def main():
)
for model in model_configs.keys():
config = model_configs[model]
available_backends, fused_attn_backends = _get_attention_backends(
available_backends, _, fused_attn_backends = get_available_attention_backends(
config,
qkv_dtype=dtype,
qkv_layout=qkv_layout,
28 changes: 14 additions & 14 deletions benchmarks/attention/benchmark_attention_rocm.py
@@ -1,5 +1,5 @@
# This file was modified for portability to AMDGPU
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
@@ -13,17 +13,17 @@
import transformer_engine
from transformer_engine_torch import NVTE_Fused_Attn_Backend

# Add test_fused_attn to the sys path
# Add TE repo root to the sys path
tests_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), "../../tests/pytorch/fused_attn")
os.path.join(os.path.dirname(__file__), "../../")
)
sys.path.append(tests_path)

from test_fused_attn import (
from tests.pytorch.utils import (
ModelConfig,
_get_attention_backends,
_run_dot_product_attention,
get_available_attention_backends,
)
from tests.pytorch.attention.test_attention import _run_dot_product_attention

pd.set_option("display.precision", 4)

@@ -46,12 +46,12 @@
is_training = True

model_configs = {
# test: b, h, hg, d, sq, skv, p, mask, bias
"test_0": ModelConfig(2, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"), # short seq
"test_1": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "no_bias"), # longer seq, mask
"test_2": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "post_scale_bias"), # bias
"test_3": ModelConfig(2, 32, 4, 128, 8192, 8192, 0.0, "causal", "no_bias"), # GQA
"test_4": ModelConfig(2, 128, 8, 128, 8192, 8192, 0.0, "causal_bottom_right", "no_bias")
# b, sq, h, dqk
"test_0": ModelConfig(2, 512, 16, 64), # short seq
"test_1": ModelConfig(2, 2048, 16, 128, attn_mask_type="causal"), # longer seq, mask
"test_2": ModelConfig(2, 2048, 16, 128, attn_mask_type="causal", attn_bias_type="post_scale_bias"), # bias
"test_3": ModelConfig(2, 8192, 32, 128, num_gqa_groups=4, attn_mask_type="causal"), # GQA
"test_4": ModelConfig(2, 8192, 128, 128, num_gqa_groups=8, attn_mask_type="causal_bottom_right")
}

# DataFrame indices and columns for results
@@ -303,7 +303,7 @@ def sanity_checks(
}

for model, cfg in model_configs.items():
avail, _, fused_bes = _get_attention_backends(
avail, _, fused_bes = get_available_attention_backends(
cfg,
qkv_dtype=dtype,
qkv_layout=qkv_layout,
@@ -364,7 +364,7 @@ def main(args):
# Benchmarking starts..
for model in model_configs.keys():
config = model_configs[model]
available_backends, _, fused_attn_backends = _get_attention_backends(
available_backends, _, fused_attn_backends = get_available_attention_backends(
config,
qkv_dtype=dtype,
qkv_layout=qkv_layout,
2 changes: 1 addition & 1 deletion benchmarks/linear/benchmark_grouped_linear.py
@@ -247,7 +247,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
num_gemms_list = [8]

if args.profile:
mkns = [(4096, 4096, 4096)]
mkns = [(4096 * 8, 4096, 4096)]
# in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as"
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
2.6.0.dev0
2.8.0.dev0
15 changes: 1 addition & 14 deletions build_tools/pytorch.py
@@ -27,20 +27,7 @@

def install_requirements() -> List[str]:
"""Install dependencies for TE/PyTorch extensions."""
reqs = ["einops"]
if not rocm_build():
reqs.append(
"nvdlfw-inspect @"
" git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.1#egg=nvdlfw-inspect"
)
reqs.extend(
[
"torch>=2.1",
"onnx",
"onnxscript@git+https://github.com/microsoft/onnxscript.git@51ecf47523ef079c53b0e620c62d56d70cfd3871",
]
)
return reqs
return ["torch>=2.1", "einops", "onnxscript==0.3.1", "onnx"]


def test_requirements() -> List[str]:
4 changes: 2 additions & 2 deletions build_tools/utils.py
@@ -16,7 +16,7 @@
import subprocess
import sys
from pathlib import Path
from importlib.metadata import version
from importlib.metadata import version as get_version
from subprocess import CalledProcessError
from typing import List, Optional, Tuple, Union

@@ -340,7 +340,7 @@ def cuda_version() -> Tuple[int, ...]:
return tuple(int(v) for v in version)

try:
version_str = version("nvidia-cuda-runtime-cu12")
version_str = get_version("nvidia-cuda-runtime-cu12")
version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit())
return version_tuple
except importlib.metadata.PackageNotFoundError:
6 changes: 4 additions & 2 deletions ci/jax.sh
@@ -1,5 +1,5 @@
#!/bin/sh
# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.

@@ -54,14 +54,14 @@ run_default_fa_lbl() {

run_test_config() {
echo ==== Run with Fused attention backend: $_fus_attn ====
export NVTE_JAX_UNITTEST_LEVEL=L0 # this env variable controls parameters set for some tests
run_default_fa 1 test_custom_call_compute.py
run_default_fa 1 test_functions.py
run 1 test_fused_attn.py
NVTE_CK_USES_FWD_V3=0 NVTE_CK_USES_BWD_V3=0 run_default_fa_lbl "v2" 3 test_fused_attn.py # Using FAv2 for forward and backward pass
run_default_fa 1 test_helper.py
run_default_fa 1 test_layer.py #it effectevly always uses unfused attention
run_default_fa 1 test_sanity_import.py
run_default_fa 1 test_sharding.py
run_default_fa 1 test_softmax.py
}

@@ -76,8 +76,10 @@ run_test_config_mgpu() {

if [ $_fus_attn = $_DEFAULT_FUSED_ATTN ]; then
_dfa_level=2
export NVTE_JAX_UNITTEST_LEVEL=L1
else
_dfa_level=3
export NVTE_JAX_UNITTEST_LEVEL=L2
fi
run $_dfa_level test_distributed_fused_attn.py $_timeout_args
run_default_fa 3 test_distributed_layernorm.py
24 changes: 12 additions & 12 deletions ci/pytorch.sh
@@ -1,5 +1,5 @@
#!/bin/sh
# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.

@@ -65,23 +65,23 @@ run_test_config(){
run_default_fa 1 test_recipe.py
run 1 test_sanity.py
run_default_fa 1 test_sanity_import.py
run_default_fa 1 fused_attn/test_fused_attn.py # Backend selection is controlled by the test
run_default_fa 1 attention/test_attention.py # Backend selection is controlled by the test
run_default_fa 1 attention/test_cp_utils.py
run_default_fa 1 attention/test_kv_cache.py
run_default_fa 1 triton_kernels/test_cast.py
run_default_fa 1 triton_kernels/test_cast_mxfp8.py
run_default_fa 1 triton_kernels/test_norm_common.py
run_default_fa 1 triton_kernels/test_norms.py
NVTE_TEST_TRITON_AUTOTUNE=1 run_default_fa_lbl "autotune" 3 triton_kernels/test_norms.py
run_default_fa 1 test_parallel_cross_entropy.py
NVTE_USE_DEQUANTIZE_TRITON=1 NVTE_USE_CAST_TRANSPOSE_TRITON=1 NVTE_USE_RMSNORM_TRITON=1 NVTE_USE_LAYERNORM_TRITON=1 run_default_fa_lbl "triton" 3 test_numerics.py
NVTE_USE_RMSNORM_TRITON=1 run_default_fa_lbl "triton" 1 test_fusible_ops.py
NVTE_USE_CAST_TRANSPOSE_TRITON=1 NVTE_USE_RMSNORM_TRITON=1 run_default_fa_lbl "triton" 1 test_fusible_ops.py
NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa_lbl "triton" 1 test_float8_current_scaling_exact.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa 3 test_numerics.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa 3 test_fusible_ops.py
NVTE_USE_ATOMIC_AMAX=1 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa 3 test_numerics.py
NVTE_USE_ATOMIC_AMAX=1 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa 3 test_fusible_ops.py
NVTE_USE_ATOMIC_AMAX=0 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa 3 test_numerics.py
NVTE_USE_ATOMIC_AMAX=0 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa 3 test_fusible_ops.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa 3 triton_kernels/test_cast.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa_lbl "amax" 3 test_numerics.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa_lbl "amax" 3 test_fusible_ops.py
NVTE_USE_ATOMIC_AMAX=1 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa_lbl "amax+triton" 3 test_numerics.py
NVTE_USE_ATOMIC_AMAX=1 NVTE_USE_CAST_TRANSPOSE_TRITON=1 run_default_fa_lbl "amax+triton" 3 test_fusible_ops.py
NVTE_USE_ATOMIC_AMAX=1 run_default_fa_lbl "amax" 3 triton_kernels/test_cast.py
}

run_test_config_mgpu(){
@@ -93,8 +93,8 @@ run_test_config_mgpu(){
run_default_fa 2 distributed/test_numerics.py
run_default_fa 1 distributed/test_torch_fsdp2.py
run_default_fa 2 distributed/test_torch_fsdp2_fp8.py
run_default_fa_lbl "flash" 3 fused_attn/test_fused_attn_with_cp.py -k "with_flash"
run_default_fa_lbl "fused" 2 fused_attn/test_fused_attn_with_cp.py -k "with_fused"
run_default_fa_lbl "flash" 3 attention/test_attention_with_cp.py -k "with_flash"
run_default_fa_lbl "fused" 2 attention/test_attention_with_cp.py -k "with_fused"
}

run_benchmark() {
4 changes: 4 additions & 0 deletions docs/api/jax.rst
@@ -19,6 +19,10 @@ Variables are available in `transformer_engine.jax.sharding`.
* JOINED_AXES: The logical axis of non-defined dimension. It is usually not sharded.


Checkpointing
------------------------------------
When using checkpointing with Transformer Engine JAX, please be aware of the checkpointing policy being applied to your model. Any JAX checkpointing policy using `dot`, such as `jax.checkpoint_policies.dots_with_no_batch_dims`, may not work with GEMMs provided by Transformer Engine as they do not always use the `jax.lax.dot_general` primitive. Instead, you can use `transformer_engine.jax.checkpoint_policies.dots_and_te_gemms_with_no_batch_dims` or similar policies that are designed to work with Transformer Engine's GEMMs and `jax.lax.dot_general` GEMMs. You may also use any JAX policies that do not filter by primitive, such as `jax.checkpoint_policies.save_only_these_names` or `jax.checkpoint_policies.everything_saveable`.
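
As an illustration only (not part of this diff), a minimal sketch of wrapping a layer in ``jax.checkpoint`` with a policy that does not filter by primitive; the toy ``layer`` function below stands in for a real Transformer Engine module:

.. code-block:: python

    import jax
    import jax.numpy as jnp

    # Toy stand-in for a layer whose GEMMs may not lower to jax.lax.dot_general.
    def layer(x, w):
        return jax.nn.gelu(x @ w)

    # A policy that does not filter by primitive, as recommended above.
    remat_layer = jax.checkpoint(
        layer, policy=jax.checkpoint_policies.everything_saveable
    )

    x = jnp.ones((4, 8))
    w = jnp.ones((8, 8))
    grads = jax.grad(lambda w: remat_layer(x, w).sum())(w)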

Modules
------------------------------------
.. autoapiclass:: transformer_engine.jax.flax.TransformerLayerType
5 changes: 4 additions & 1 deletion docs/api/pytorch.rst
@@ -49,7 +49,7 @@ pyTorch

.. autoapifunction:: transformer_engine.pytorch.moe_permute

.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs
.. autoapifunction:: transformer_engine.pytorch.moe_permute_with_probs

.. autoapifunction:: transformer_engine.pytorch.moe_unpermute

@@ -63,3 +63,6 @@

.. autoapifunction:: transformer_engine.pytorch.destroy_ub
.. autoapifunction:: transformer_engine.pytorch.moe_sort_chunks_by_index

.. autoapiclass:: transformer_engine.pytorch.UserBufferQuantizationMode
:members: FP8, NONE
6 changes: 3 additions & 3 deletions docs/debug/1_getting_started.rst
@@ -21,7 +21,7 @@ Transformer Engine provides a set of precision debug tools which allow you to ea
There are 4 things one needs to do to use Transformer Engine debug features:

1. Create a configuration YAML file to configure the desired features.
2. Import, and initialize the `Nvidia-DL-Framework-Inspect <https://github.com/NVIDIA/nvidia-dlfw-inspect>`_ tool, which is installed as the dependency of the Transformer Engine.
2. Import, initialize, and install the `Nvidia-DL-Framework-Inspect <https://github.com/NVIDIA/nvidia-dlfw-inspect>`_ tool.
3. One can pass ``name="..."`` when creating TE layers to easier identify layer names. If this is not provided, names will be inferred automatically.
4. Invoke ``debug_api.step()`` at the end of one forward-backward pass.

@@ -141,7 +141,7 @@ Adjusting Python file
In the modified code above, the following changes were made:

1. Added an import for ``nvdlfw_inspect.api``.
2. Initialized the Nvidia-DL-Framework-Inspect by calling ``debug_api.initialize()`` with appropriate configuration, specifying the path to the config file, feature directories, and log directory.
2. Initialized the Nvidia-DL-Framework-Inspect by calling ``debug_api.initialize()`` with appropriate configuration, specifying the path to the config file, feature directories, and log directory. The directory with Transformer Engine features is located `here <https://github.com/NVIDIA/TransformerEngine/tree/main/transformer_engine/debug/features>`_. The full parameters description could be found :doc:`here <3_api_debug_setup>`.
3. Added ``debug_api.step()`` after each of the forward-backward pass.
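
As an illustration only (not part of this diff), a minimal sketch of those three changes around a single TE layer; the keyword-argument names passed to ``debug_api.initialize()`` are assumptions based on the description above:

.. code-block:: python

    import torch
    import transformer_engine.pytorch as te
    import nvdlfw_inspect.api as debug_api  # change 1: the import

    # Change 2: initialize Nvidia-DL-Framework-Inspect with the config file,
    # feature directories, and log directory (argument names assumed here).
    debug_api.initialize(
        config_file="./debug_config.yaml",
        feature_dirs=["./transformer_engine/debug/features"],
        log_dir="./debug_logs",
    )

    layer = te.Linear(128, 128, name="example_linear")

    for _ in range(3):
        out = layer(torch.randn(32, 128, device="cuda"))
        out.sum().backward()
        debug_api.step()  # change 3: mark the end of each forward-backward pass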

Inspecting the logs
Expand Down Expand Up @@ -238,4 +238,4 @@ Let's run training and open TensorBoard by ``tensorboard --logdir=./tensorboard_
.. figure:: ./img/tensorboard.png
:align: center

Fig 2: TensorBoard with plotted stats.
Fig 2: TensorBoard with plotted stats.
16 changes: 5 additions & 11 deletions docs/debug/3_api_te_calls.rst
@@ -12,14 +12,7 @@ Let's look deeper into how Nvidia-DL-Framework-Inspect with Transformer Engine w

Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 1 Linear Layer. For tensors mentioned in ``config.yaml``, behavior of ``modify_tensor_enabled()`` and ``modify_tensor()`` calls are substituted with definitions from the feature class. Other calls return default values - in fact they do nothing.

In this page, all calls from TransformerEngine to the Nvidia-DL-Framework-Inspect for each GEMM are listed. The order of these calls is illustrated in the image below.

.. figure:: ./img/api_calls2.svg
:align: center

Fig 2: The calls to Nvidia-DL-Framework-Inspect done for Transformer Engine. There are 2 types of calls: GEMM calls and routing calls.


In this page, all calls from TransformerEngine to the Nvidia-DL-Framework-Inspect for each GEMM are listed.
There are 2 categories of API calls, each is used for different purposes:

- GEMM calls - invoked during every GEMM, used to process or quantize tensors and collect information about them,
@@ -32,14 +32,15 @@ if fusions happen. An important remark is that if no feature is used for the lay

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor_enabled

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.fp8_gemm_enabled

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_enabled

.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize_enabled