From e62ad401b2b280440a0487f4dcd7c59f7de51565 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 16 Jan 2026 07:58:38 +0000 Subject: [PATCH 1/2] Enable gfx950 CI on release_v2.2_rocm branch tests/pytorch/test_fused_optimizer.py: relax fp8 exp_avg tolerance on HIP gfx950 tests/pytorch/test_fusible_ops.py: skip fp8 quantized GEMM cases lacking hipBLASLt algos on gfx950 tests/pytorch/test_numerics.py: skip tests on gfx950 with missing hipBLASLt algo transformer_engine/pytorch/triton_kernels/rmsnorm.py: guard pointer alignment hints based on runtime alignment in fwd/bwd --- tests/pytorch/test_fused_optimizer.py | 5 ++ tests/pytorch/test_fusible_ops.py | 12 +++++ tests/pytorch/test_numerics.py | 7 +++ .../pytorch/triton_kernels/rmsnorm.py | 54 ++++++++++++++----- 4 files changed, 64 insertions(+), 14 deletions(-) diff --git a/tests/pytorch/test_fused_optimizer.py b/tests/pytorch/test_fused_optimizer.py index 507fd3f35..fce58ab5e 100644 --- a/tests/pytorch/test_fused_optimizer.py +++ b/tests/pytorch/test_fused_optimizer.py @@ -9,6 +9,7 @@ import pytest import torch from torch import nn +from torch.utils.cpp_extension import IS_HIP_EXTENSION from torch.testing._internal.common_device_type import largeTensorTest import transformer_engine.pytorch as te from transformer_engine.common.recipe import DelayedScaling @@ -16,6 +17,7 @@ from transformer_engine.pytorch import fp8_model_init from transformer_engine.pytorch.utils import is_bf16_compatible from transformer_engine.pytorch.fp8 import FP8GlobalStateManager +from transformer_engine.pytorch.utils import get_device_compute_capability # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() @@ -363,6 +365,7 @@ def test_fp16_exp_avg(self): @pytest.mark.skipif(not is_bf16_compatible(), reason="bf16 if not supported") @pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) def test_fp8_exp_avg(self): + model_tol = 3e-2 if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) else None self.gen_precision_aware_test( use_fp8_params=False, param_dtype=torch.bfloat16, @@ -373,6 +376,8 @@ def test_fp8_exp_avg(self): exp_avg_sq_dtype=torch.float32, master_rtol=1e-2, master_atol=1e-2, + model_rtol=model_tol, + model_atol=model_tol, ) @pytest.mark.skipif(not is_bf16_compatible(), reason="bf16 if not supported") diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index 6f100ccd4..480b607c3 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -27,7 +27,9 @@ from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor, MXFP8Quantizer from transformer_engine.pytorch.utils import is_bf16_compatible +from transformer_engine.pytorch.utils import get_device_compute_capability import transformer_engine_torch as tex +from torch.utils.cpp_extension import IS_HIP_EXTENSION # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() @@ -900,6 +902,16 @@ def test_basic_linear_quantized( quantized_grad_input: bool, ) -> None: """GEMM with FP8 inputs and outputs""" + if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5): + if ( + quantization + and quantization.startswith("fp8") + and quantized_compute + and (quantized_grad_input or quantized_output) + ): + pytest.skip( + "hipBLASLt does not provide suitable algorithms on gfx950 for this config." 
+ ) self._test_basic_linear( dtype=torch.bfloat16, quantization=quantization, diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index f0480eac7..4ffd2f224 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -726,6 +726,13 @@ def test_gpt_full_activation_recompute( use_cast_transpose_triton = bool( int(os.environ.get('NVTE_USE_CAST_TRANSPOSE_TRITON', '0')) ) if fp8 and recipe.float8_current_scaling() and use_cast_transpose_triton: pytest.skip("Float8 Current Scaling unsupported for full recompute.") + if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5): + if (dtype == torch.bfloat16 + and not fp8 + and not use_reentrant + and recipe.float8_per_tensor_scaling() + ): + pytest.skip("hipBLASLt does not provide suitable algorithms on MI350 for this config.") config = model_configs[model] torch.compiler.reset() # avoid cache size limit overflow diff --git a/transformer_engine/pytorch/triton_kernels/rmsnorm.py b/transformer_engine/pytorch/triton_kernels/rmsnorm.py index 5dad61b89..4cf1790dc 100644 --- a/transformer_engine/pytorch/triton_kernels/rmsnorm.py +++ b/transformer_engine/pytorch/triton_kernels/rmsnorm.py @@ -49,6 +49,8 @@ def _rmsnorm_fwd_triton( IS_FP8: tl.constexpr, FP8_MAX: tl.constexpr, MAKE_TRANSPOSE: tl.constexpr, + INPUT_ALIGNED_16: tl.constexpr, + OUTPUT_ALIGNED_16: tl.constexpr, ): row_start = tl.program_id(0) col_offsets = tl.arange(0, BLOCK_SIZE) @@ -77,7 +79,8 @@ def _rmsnorm_fwd_triton( for blk_idx in tl.range(0, n_cols_blks, num_stages=2): cols = blk_idx * BLOCK_SIZE + col_offsets input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) sum_squares += tl.sum(x * x, axis=0) @@ -85,7 +88,8 @@ def _rmsnorm_fwd_triton( cols = n_cols_blks * BLOCK_SIZE + col_offsets mask = cols < n_cols input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) sum_squares += tl.sum(x * x, axis=0) @@ -100,7 +104,8 @@ def _rmsnorm_fwd_triton( for blk_idx in tl.range(0, n_cols_blks, num_stages=2): cols = blk_idx * BLOCK_SIZE + col_offsets input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) g_ptrs = g_ptr + cols g = tl.load(g_ptrs).to(tl.float32) @@ -143,7 +148,8 @@ def _rmsnorm_fwd_triton( mask = col_offsets < n_cols for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2): input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) row_norm = row * row @@ -159,7 +165,8 @@ def _rmsnorm_fwd_triton( rms_norm = row * norm_factor * g output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets - output_ptrs = tl.multiple_of(output_ptrs, (16, )) + if OUTPUT_ALIGNED_16: + output_ptrs = tl.multiple_of(output_ptrs, (16, )) if IS_FP8: amax_temp = tl.max(tl.abs(rms_norm), axis=-1) amax = tl.maximum(amax, amax_temp) @@ -180,7 +187,9 @@ def _rmsnorm_fwd_triton( @triton.jit def 
_rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, dg_ptr, input_row_stride, output_row_stride, n_rows, n_cols, ZERO_CENTERED_GAMMA: tl.constexpr, BLOCK_SIZE: tl.constexpr, - USE_BLOCKED: tl.constexpr, NUM_PRGMS: tl.constexpr): + USE_BLOCKED: tl.constexpr, NUM_PRGMS: tl.constexpr, + INPUT_ALIGNED_16: tl.constexpr, GRAD_OUTPUT_ALIGNED_16: tl.constexpr, + DX_ALIGNED_16: tl.constexpr): row_start = tl.program_id(0) col_offsets = tl.arange(0, BLOCK_SIZE) # tl.assume(input_row_stride >= 0) @@ -205,8 +214,10 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d input_ptrs = row_input_ptr + cols grad_output_ptrs = row_grad_output_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) grad_output = tl.load(grad_output_ptrs).to(tl.float32) @@ -237,8 +248,10 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d input_ptrs = row_input_ptr + cols grad_output_ptrs = row_grad_output_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) grad_output = tl.load(grad_output_ptrs).to(tl.float32) @@ -288,9 +301,12 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d grad_output_ptrs = grad_output_ptr + row_idx * output_row_stride + col_offsets dx_ptrs = dx_ptr + row_idx * input_row_stride + col_offsets - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) - dx_ptrs = tl.multiple_of(dx_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if DX_ALIGNED_16: + dx_ptrs = tl.multiple_of(dx_ptrs, (16, )) x = tl.load(input_ptrs, mask=mask, other=0.0).to(tl.float32) grad_output = tl.load(grad_output_ptrs, mask=mask, other=0.0).to(tl.float32) @@ -348,9 +364,12 @@ def te_rmsnorm_bwd_triton(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma): dg_tmp = torch.empty(dg_tmp_rows(x_, sm_margin), N, device='cuda', dtype=torch.float32, requires_grad=False) if need_reduction else None grid_bwd = lambda meta: (NUM_PRGMS, ) + input_aligned_16 = (x_.data_ptr() % 16 == 0) and (x_.stride(-1) % 16 == 0) + grad_output_aligned_16 = (dz_.data_ptr() % 16 == 0) and (dz_.stride(-1) % 16 == 0) + dx_aligned_16 = (dx.data_ptr() % 16 == 0) and (dx.stride(-1) % 16 == 0) _rmsnorm_bwd_triton[grid_bwd](dz_, x_, gamma_, rsigma_, dx, dg_tmp if need_reduction else dgamma, x_.stride(0), dz_.stride(0), M, N, zero_centered_gamma, blk_size, - USE_BLOCKED, NUM_PRGMS, num_warps=8) + USE_BLOCKED, NUM_PRGMS, input_aligned_16, grad_output_aligned_16, dx_aligned_16, num_warps=8) if need_reduction: grid_reduce = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])] @@ -440,6 +459,11 @@ def te_rmsnorm_fwd_triton( grid_fwd = lambda meta: (NUM_PRGMS, ) + input_aligned_16 = (input.data_ptr() % 16 == 0) and (input.stride(-1) % 16 == 0) + out_alignment_tensor = out._data if hasattr(out, "_data") else out + output_aligned_16 = 
(out_alignment_tensor.data_ptr() % 16 == 0) and ( + out_alignment_tensor.stride(-1) % 16 == 0 + ) # TODO(micky774) Implement fused MXFP8 quantization within the kernel _rmsnorm_fwd_triton[grid_fwd]( out_ptr, @@ -462,6 +486,8 @@ def te_rmsnorm_fwd_triton( IS_FP8, FP8_MAX, MAKE_TRANSPOSE, + input_aligned_16, + output_aligned_16, ) if IS_MXFP8: out = quantizer.quantize(out) From 20af9e49e7e1afe0f1bb496ee14574e6f3fb2020 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 16 Jan 2026 08:11:32 +0000 Subject: [PATCH 2/2] CI: Add Github Actions support to release_v2.2_rocm --- .github/workflows/rocm-ci.yml | 437 ++++++++++++++++++++++++++++++++++ ci/ci_config.json | 7 + 2 files changed, 444 insertions(+) create mode 100644 .github/workflows/rocm-ci.yml create mode 100644 ci/ci_config.json diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml new file mode 100644 index 000000000..ec5010054 --- /dev/null +++ b/.github/workflows/rocm-ci.yml @@ -0,0 +1,437 @@ +# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. + +name: TransformerEngine CI + +on: + push: + branches: + - 'dev' + - 'release_v1.*_rocm' + - 'release_v2.*_rocm' + pull_request: + branches: + - 'dev' + - 'release_v1.**_rocm' + - 'release_v2.**_rocm' + workflow_dispatch: + inputs: + test_level: + description: 'Test Level (1-3)' + required: true + default: '1' + skip_dev_merge: + description: 'Skip merging dev branch' + type: boolean + default: false + docker_image_override: + description: 'Manual Docker Image (Leave empty to use config file value)' + required: false + type: string + test_config_from_source: + description: 'DEBUG: Use config.json from current source branch instead of dev' + type: boolean + default: false + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_and_test: + name: Build and Test on GPU (${{ matrix.runner }}) + timeout-minutes: 720 + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-8, linux-mi355-8] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: 'recursive' + fetch-depth: 0 + + - name: Host Diagnostics & Environment Setup + id: host-setup + run: | + # Host Activity Checks + echo "::group::Host Diagnostics" + + echo ">>> Active Containers:" + docker ps -a + + echo ">>> ROCm Installation:" + ls -d /opt/rocm* || echo "No /opt/rocm found" + echo ">>> GPU info:" + ls -l /dev/dri + ls -l /dev/kfd + rocm-smi + + echo ">>> Kernel Command Line:" + cat /proc/cmdline + echo "::endgroup::" + + # Calculate Test Level + # Default to input (or '1' if input is missing/null) + CALC_LEVEL="${{ inputs.test_level || '1' }}" + + # COnly force Level 3 if this is a direct PUSH to dev or a release branch + if [[ "${{ github.event_name }}" == "push" ]]; then + if [[ "${{ github.ref_name }}" == "dev" || "${{ github.ref_name }}" =~ ^release_v.*_rocm$ ]]; then + echo "::notice::Push to monitored branch (${{ github.ref_name }}) detected. Forcing Level 3." 
+ CALC_LEVEL="3" + fi + fi + + echo "TEST_LEVEL=$CALC_LEVEL" >> $GITHUB_ENV + + # Print Final Environment + echo "::group::Environment & Parameters" + echo "Final Test Level: $CALC_LEVEL" + echo "Event Name: ${{ github.event_name }}" + echo "Ref Name: ${{ github.ref_name }}" + echo "Base Ref: ${{ github.base_ref }}" + env | sort + echo "::endgroup::" + + - name: Select Docker Image Tag + id: select-image + run: | + # Determine config source + # Default we are fetching from 'dev' branch + CONFIG_BRANCH="dev" + + # If manual run requesting source config, switch branch + if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then + CONFIG_BRANCH="${{ github.ref_name }}" + echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)" + fi + + # Download config + CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json" + echo "Attempting to fetch image config from: $CONFIG_URL" + + if curl -s -f -o docker_config.json "$CONFIG_URL"; then + echo "Successfully downloaded config from $CONFIG_BRANCH." + else + echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)." + + # Fallback: Check source branch file + if [[ -f "ci/ci_config.json" ]]; then + echo "::notice::Falling back to local 'ci/ci_config.json' from checkout." + cp ci/ci_config.json docker_config.json + else + echo "::error::Config file not found in $CONFIG_BRANCH OR locally." + exit 1 + fi + fi + + # Determine image key + BRANCH_NAME="${{ github.base_ref || github.ref_name }}" + echo "Determining image for branch: $BRANCH_NAME" + + # Logic: Check if branch matches "release_vX.X". + # If so, look for that key in JSON. Otherwise default. + JSON_KEY="default" + + if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then + VERSION_KEY="release_v${BASH_REMATCH[1]}" + # Check if this specific version key exists in the JSON + if [[ $(jq "(.docker_images | has(\"$VERSION_KEY\"))" docker_config.json) == "true" ]]; then + JSON_KEY="$VERSION_KEY" + fi + fi + + echo "Selected config key: $JSON_KEY" + + # Extract image name from json + IMAGE_TO_USE=$(jq -r ".docker_images.\"$JSON_KEY\"" docker_config.json) + + # Check input from workflow_dispatch overriding the image + MANUAL_OVERRIDE="${{ inputs.docker_image_override }}" + if [[ -n "$MANUAL_OVERRIDE" ]]; then + echo "::notice::Manual override detected: $MANUAL_OVERRIDE" + IMAGE_TO_USE="$MANUAL_OVERRIDE" + fi + + echo "Selected image: $IMAGE_TO_USE" + echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT + + - name: Pull Docker Image + run: | + docker pull ${{ steps.select-image.outputs.image-tag }} + + - name: Run Container + run: | + docker run -dt \ + --name te-runner \ + --network=host \ + --device=/dev/dri --device=/dev/kfd \ + --shm-size=16G \ + --pid=host \ + --group-add $(getent group render | cut -d: -f3) \ + --group-add $(getent group video | cut -d: -f3) \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + ${{ steps.select-image.outputs.image-tag}} + + - name: Container Diagnostics & GPU Setup + id: container-diag + run: | + echo "::group::Container Configuration" + # Check Shared Memory Size inside container + echo ">>> /dev/shm size:" + docker exec te-runner df -h /dev/shm + + # Check OS/Kernel inside container + echo ">>> Container OS:" + docker exec te-runner cat /etc/os-release | grep PRETTY_NAME + echo "::endgroup::" + + echo "::group::ROCm Diagnostics (Host vs Container)" + echo ">>> CONTAINER rocm-smi:" + docker exec te-runner rocm-smi || true + echo 
"::endgroup::" + + # Determine Architecture + # Run rocminfo inside the container and capture the output + ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'") + + if [ -z "$ARCH" ]; then + echo "::error::Could not determine GPU architecture using rocminfo inside the container." + docker exec te-runner rocminfo + exit 1 + fi + + echo "Detected GPU Arch: $ARCH" + echo "arch=$ARCH" >> $GITHUB_OUTPUT + + - name: Build Project + run: | + docker exec \ + -e GPU_ARCH=${{ steps.container-diag.outputs.arch }} \ + te-runner bash -c "$(cat <<'EOF' + set -ex + + export HIP_PATH="" + export PYTORCH_ROCM_ARCH=$GPU_ARCH + export NVTE_ROCM_ARCH=$GPU_ARCH + export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts + pip install ninja + git config --global --add safe.directory '*' + pip install --no-build-isolation -v . 2>&1 + EOF + )" + + - name: Run sGPU tests + id: sgpu-tests + continue-on-error: true + run: | + # Cleanup previous failure markers if any. Don't actually do anything on k8s pods + rm -f FAIL_* + + docker exec \ + -e TEST_SGPU=1 \ + -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -x -o pipefail + ulimit -c 0 # Disable core dumps + + # debug output + ls -d /opt/rocm* + python --version + pip list | egrep "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext" + + HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 & + torch_pid=$!; echo Pytorch test pid $! + + HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 & + jax_pid=$!; echo JAX test pid $! + + HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 & + core_pid=$!; echo Core test pid $! + + wait $core_pid; core_rc=$? + wait $jax_pid; jax_rc=$? + wait $torch_pid; torch_rc=$? + + # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later + # Check PyTorch + if [ $torch_rc -ne 0 ]; then + echo "::group::[FAILED] PyTorch sGPU Log" + cat /workspace/torch_sgpu.log + echo "::endgroup::" + echo "::error::Pytorch sGPU test FAILED." + touch /workspace/FAIL_TORCH_SGPU + fi + + # Check JAX + if [ $jax_rc -ne 0 ]; then + echo "::group::[FAILED] JAX sGPU Log" + cat /workspace/jax_sgpu.log + echo "::endgroup::" + echo "::error::JAX sGPU test FAILED." + touch /workspace/FAIL_JAX_SGPU + fi + + # Check Core + if [ $core_rc -ne 0 ]; then + echo "::group::[FAILED] Core sGPU Log" + cat /workspace/core_sgpu.log + echo "::endgroup::" + echo "::error::Core sGPU test FAILED." + touch /workspace/FAIL_CORE_SGPU + fi + + test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $core_rc -eq 0 + EOF + )" + + # Export failed tests statuses to host runner + if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> $GITHUB_OUTPUT; fi + + - name: Run mGPU tests + id: mgpu-tests + continue-on-error: true + run: | + docker exec \ + -e TEST_MGPU=1 \ + -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -x -o pipefail + ulimit -c 0 # Disable core dumps + + # Run PyTorch + ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1 + torch_rc=$? + + # Run JAX + ci/jax.sh > /workspace/jax_mgpu.log 2>&1 + jax_rc=$? 
+ + # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later + if [ $torch_rc -ne 0 ]; then + echo "::group::[FAILED] PyTorch mGPU Log" + cat /workspace/torch_mgpu.log + echo "::endgroup::" + echo "::error::Pytorch mGPU test FAILED." + touch /workspace/FAIL_TORCH_MGPU + fi + + if [ $jax_rc -ne 0 ]; then + echo "::group::[FAILED] JAX mGPU Log" + cat /workspace/jax_mgpu.log + echo "::endgroup::" + echo "::error::JAX mGPU test FAILED." + touch /workspace/FAIL_JAX_MGPU + fi + + test $torch_rc -eq 0 -a $jax_rc -eq 0 + EOF + )" + + # Export failed tests statuses to host runner + if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi + + - name: Run Examples + id: examples-tests + continue-on-error: true + run: | + docker exec te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -ex -o pipefail + ulimit -c 0 # Disable core dumps + + cd /workspace/examples/pytorch/mnist + python main.py 2>&1 | tee /workspace/examples.log + python main.py --use-te 2>&1 | tee -a /workspace/examples.log + python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log + + cd /workspace/examples/jax/mnist + pip3 install -r requirements.txt + python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log + + cd /workspace/examples/jax/encoder + pip3 install -r requirements.txt + python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log + EOF + )" + + - name: Check Test Failure Status + if: always() + run: | + EXIT_STATUS=0 + # Check outcomes of the specific test steps + # "outcome" will be 'failure' even if continue-on-error was true + + # sGPU CHECKS + # We check for the file existence directly because the 'Run sGPU tests' step + # halts immediately on docker failure, skipping the lines that set step outputs. + if [[ -f FAIL_CORE_SGPU ]]; then + echo "::error::Core sGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_TORCH_SGPU ]]; then + echo "::error::PyTorch sGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_JAX_SGPU ]]; then + echo "::error::JAX sGPU Tests Failed." + EXIT_STATUS=1 + fi + + # mGPU CHECKS + if [[ -f FAIL_TORCH_MGPU ]]; then + echo "::error::PyTorch mGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_JAX_MGPU ]]; then + echo "::error::JAX mGPU Tests Failed." + EXIT_STATUS=1 + fi + + # EXAMPLES CHECK + # Examples script does not use marker files, so we rely on step outcome + if [[ "${{ steps.examples-tests.outcome }}" == "failure" ]]; then + echo "::error::Example Tests Failed." 
+ EXIT_STATUS=1 + fi + + # Fail the job if any errors were detected + if [[ "$EXIT_STATUS" == "1" ]]; then + exit 1 + fi + + - name: Copy logs and reports from container + if: always() + run: | + docker cp te-runner:/workspace/torch_sgpu.log ./torch_sgpu.log || true + docker cp te-runner:/workspace/jax_sgpu.log ./jax_sgpu.log || true + docker cp te-runner:/workspace/core_sgpu.log ./core_sgpu.log || true + docker cp te-runner:/workspace/torch_mgpu.log ./torch_mgpu.log || true + docker cp te-runner:/workspace/jax_mgpu.log ./jax_mgpu.log || true + + - name: Upload logs and test reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-and-reports-${{ matrix.runner }} + path: | + *.log + if-no-files-found: ignore + retention-days: 5 + + - name: Cleanup container + if: always() + run: docker rm -f te-runner || true \ No newline at end of file diff --git a/ci/ci_config.json b/ci/ci_config.json new file mode 100644 index 000000000..f87079608 --- /dev/null +++ b/ci/ci_config.json @@ -0,0 +1,7 @@ +{ + "docker_images": { + "default": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.1.1_ubuntu22.04_py3.11_pytorch_release_2.8_63e525b2_jax_0.7.1_fa-2.8.0", + "release_v1.13": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273", + "release_v1.14": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273" + } + } \ No newline at end of file
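
Note on the rmsnorm change in patch 1/2: every hunk there follows the same pattern of computing a 16-byte alignment predicate on the host and forwarding it to the Triton kernel as a tl.constexpr flag, so that tl.multiple_of pointer hints are only emitted when they actually hold. The standalone sketch below illustrates that pattern under simplified assumptions (a 1-D contiguous tensor, so only the base address is checked); the kernel and helper names (_scale_kernel, is_16_byte_aligned) are illustrative and not part of the patch.

import torch
import triton
import triton.language as tl


def is_16_byte_aligned(t: torch.Tensor) -> bool:
    # For this 1-D contiguous example only the base address matters; the 2-D
    # rmsnorm kernels above also take the row stride into account.
    return t.data_ptr() % 16 == 0


@triton.jit
def _scale_kernel(x_ptr, y_ptr, n, s, BLOCK: tl.constexpr, ALIGNED_16: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x_ptrs = x_ptr + offs
    if ALIGNED_16:
        # Resolved at compile time: unaligned inputs get a specialization
        # without the hint instead of a misleading one.
        x_ptrs = tl.multiple_of(x_ptrs, (16, ))
    x = tl.load(x_ptrs, mask=mask, other=0.0)
    tl.store(y_ptr + offs, x * s, mask=mask)


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    y = torch.empty_like(x)
    n = x.numel()
    grid = (triton.cdiv(n, 1024), )
    _scale_kernel[grid](x, y, n, s, BLOCK=1024, ALIGNED_16=is_16_byte_aligned(x))
    return y

Because the flag is a tl.constexpr, the aligned and unaligned cases compile to separate kernel specializations and no per-element runtime check is added, which mirrors how INPUT_ALIGNED_16 / OUTPUT_ALIGNED_16 and the backward-side flags are wired through te_rmsnorm_fwd_triton and te_rmsnorm_bwd_triton in the patch.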