From e62ad401b2b280440a0487f4dcd7c59f7de51565 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 16 Jan 2026 07:58:38 +0000 Subject: [PATCH 1/2] Enable gfx950 CI on release_v2.2_rocm branch tests/pytorch/test_fused_optimizer.py: relax fp8 exp_avg tolerance on HIP gfx950 tests/pytorch/test_fusible_ops.py: skip fp8 quantized GEMM cases lacking hipBLASLt algos on gfx950 tests/pytorch/test_numerics.py: skip tests on gfx950 with missing hipBLASLt algo transformer_engine/pytorch/triton_kernels/rmsnorm.py: guard pointer alignment hints based on runtime alignment in fwd/bwd --- tests/pytorch/test_fused_optimizer.py | 5 ++ tests/pytorch/test_fusible_ops.py | 12 +++++ tests/pytorch/test_numerics.py | 7 +++ .../pytorch/triton_kernels/rmsnorm.py | 54 ++++++++++++++----- 4 files changed, 64 insertions(+), 14 deletions(-) diff --git a/tests/pytorch/test_fused_optimizer.py b/tests/pytorch/test_fused_optimizer.py index 507fd3f35..fce58ab5e 100644 --- a/tests/pytorch/test_fused_optimizer.py +++ b/tests/pytorch/test_fused_optimizer.py @@ -9,6 +9,7 @@ import pytest import torch from torch import nn +from torch.utils.cpp_extension import IS_HIP_EXTENSION from torch.testing._internal.common_device_type import largeTensorTest import transformer_engine.pytorch as te from transformer_engine.common.recipe import DelayedScaling @@ -16,6 +17,7 @@ from transformer_engine.pytorch import fp8_model_init from transformer_engine.pytorch.utils import is_bf16_compatible from transformer_engine.pytorch.fp8 import FP8GlobalStateManager +from transformer_engine.pytorch.utils import get_device_compute_capability # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() @@ -363,6 +365,7 @@ def test_fp16_exp_avg(self): @pytest.mark.skipif(not is_bf16_compatible(), reason="bf16 if not supported") @pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8) def test_fp8_exp_avg(self): + model_tol = 3e-2 if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) else None self.gen_precision_aware_test( use_fp8_params=False, param_dtype=torch.bfloat16, @@ -373,6 +376,8 @@ def test_fp8_exp_avg(self): exp_avg_sq_dtype=torch.float32, master_rtol=1e-2, master_atol=1e-2, + model_rtol=model_tol, + model_atol=model_tol, ) @pytest.mark.skipif(not is_bf16_compatible(), reason="bf16 if not supported") diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index 6f100ccd4..480b607c3 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -27,7 +27,9 @@ from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer from transformer_engine.pytorch.tensor.mxfp8_tensor import MXFP8Tensor, MXFP8Quantizer from transformer_engine.pytorch.utils import is_bf16_compatible +from transformer_engine.pytorch.utils import get_device_compute_capability import transformer_engine_torch as tex +from torch.utils.cpp_extension import IS_HIP_EXTENSION # Check if FP8 is supported fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() @@ -900,6 +902,16 @@ def test_basic_linear_quantized( quantized_grad_input: bool, ) -> None: """GEMM with FP8 inputs and outputs""" + if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5): + if ( + quantization + and quantization.startswith("fp8") + and quantized_compute + and (quantized_grad_input or quantized_output) + ): + pytest.skip( + "hipBLASLt does not provide suitable algorithms on gfx950 for this config." 
+ ) self._test_basic_linear( dtype=torch.bfloat16, quantization=quantization, diff --git a/tests/pytorch/test_numerics.py b/tests/pytorch/test_numerics.py index f0480eac7..4ffd2f224 100644 --- a/tests/pytorch/test_numerics.py +++ b/tests/pytorch/test_numerics.py @@ -726,6 +726,13 @@ def test_gpt_full_activation_recompute( use_cast_transpose_triton = bool( int(os.environ.get('NVTE_USE_CAST_TRANSPOSE_TRITON', '0')) ) if fp8 and recipe.float8_current_scaling() and use_cast_transpose_triton: pytest.skip("Float8 Current Scaling unsupported for full recompute.") + if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5): + if (dtype == torch.bfloat16 + and not fp8 + and not use_reentrant + and recipe.float8_per_tensor_scaling() + ): + pytest.skip("hipBLASLt does not provide suitable algorithms on MI350 for this config.") config = model_configs[model] torch.compiler.reset() # avoid cache size limit overflow diff --git a/transformer_engine/pytorch/triton_kernels/rmsnorm.py b/transformer_engine/pytorch/triton_kernels/rmsnorm.py index 5dad61b89..4cf1790dc 100644 --- a/transformer_engine/pytorch/triton_kernels/rmsnorm.py +++ b/transformer_engine/pytorch/triton_kernels/rmsnorm.py @@ -49,6 +49,8 @@ def _rmsnorm_fwd_triton( IS_FP8: tl.constexpr, FP8_MAX: tl.constexpr, MAKE_TRANSPOSE: tl.constexpr, + INPUT_ALIGNED_16: tl.constexpr, + OUTPUT_ALIGNED_16: tl.constexpr, ): row_start = tl.program_id(0) col_offsets = tl.arange(0, BLOCK_SIZE) @@ -77,7 +79,8 @@ def _rmsnorm_fwd_triton( for blk_idx in tl.range(0, n_cols_blks, num_stages=2): cols = blk_idx * BLOCK_SIZE + col_offsets input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) sum_squares += tl.sum(x * x, axis=0) @@ -85,7 +88,8 @@ def _rmsnorm_fwd_triton( cols = n_cols_blks * BLOCK_SIZE + col_offsets mask = cols < n_cols input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) sum_squares += tl.sum(x * x, axis=0) @@ -100,7 +104,8 @@ def _rmsnorm_fwd_triton( for blk_idx in tl.range(0, n_cols_blks, num_stages=2): cols = blk_idx * BLOCK_SIZE + col_offsets input_ptrs = row_input_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) g_ptrs = g_ptr + cols g = tl.load(g_ptrs).to(tl.float32) @@ -143,7 +148,8 @@ def _rmsnorm_fwd_triton( mask = col_offsets < n_cols for row_idx in tl.range(row_start, n_rows, NUM_PRGMS, num_stages=2): input_ptrs = input_ptr + row_idx * input_row_stride + col_offsets - input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) row = tl.load(input_ptrs, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) g = tl.load(g_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) row_norm = row * row @@ -159,7 +165,8 @@ def _rmsnorm_fwd_triton( rms_norm = row * norm_factor * g output_ptrs = output_ptr + row_idx * output_row_stride + col_offsets - output_ptrs = tl.multiple_of(output_ptrs, (16, )) + if OUTPUT_ALIGNED_16: + output_ptrs = tl.multiple_of(output_ptrs, (16, )) if IS_FP8: amax_temp = tl.max(tl.abs(rms_norm), axis=-1) amax = tl.maximum(amax, amax_temp) @@ -180,7 +187,9 @@ def _rmsnorm_fwd_triton( @triton.jit def 
_rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, dg_ptr, input_row_stride, output_row_stride, n_rows, n_cols, ZERO_CENTERED_GAMMA: tl.constexpr, BLOCK_SIZE: tl.constexpr, - USE_BLOCKED: tl.constexpr, NUM_PRGMS: tl.constexpr): + USE_BLOCKED: tl.constexpr, NUM_PRGMS: tl.constexpr, + INPUT_ALIGNED_16: tl.constexpr, GRAD_OUTPUT_ALIGNED_16: tl.constexpr, + DX_ALIGNED_16: tl.constexpr): row_start = tl.program_id(0) col_offsets = tl.arange(0, BLOCK_SIZE) # tl.assume(input_row_stride >= 0) @@ -205,8 +214,10 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d input_ptrs = row_input_ptr + cols grad_output_ptrs = row_grad_output_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) grad_output = tl.load(grad_output_ptrs).to(tl.float32) @@ -237,8 +248,10 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d input_ptrs = row_input_ptr + cols grad_output_ptrs = row_grad_output_ptr + cols - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) x = tl.load(input_ptrs).to(tl.float32) grad_output = tl.load(grad_output_ptrs).to(tl.float32) @@ -288,9 +301,12 @@ def _rmsnorm_bwd_triton(grad_output_ptr, input_ptr, g_ptr, rsigma_ptr, dx_ptr, d grad_output_ptrs = grad_output_ptr + row_idx * output_row_stride + col_offsets dx_ptrs = dx_ptr + row_idx * input_row_stride + col_offsets - input_ptrs = tl.multiple_of(input_ptrs, (16, )) - grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) - dx_ptrs = tl.multiple_of(dx_ptrs, (16, )) + if INPUT_ALIGNED_16: + input_ptrs = tl.multiple_of(input_ptrs, (16, )) + if GRAD_OUTPUT_ALIGNED_16: + grad_output_ptrs = tl.multiple_of(grad_output_ptrs, (16, )) + if DX_ALIGNED_16: + dx_ptrs = tl.multiple_of(dx_ptrs, (16, )) x = tl.load(input_ptrs, mask=mask, other=0.0).to(tl.float32) grad_output = tl.load(grad_output_ptrs, mask=mask, other=0.0).to(tl.float32) @@ -348,9 +364,12 @@ def te_rmsnorm_bwd_triton(dz, x, rsigma, gamma, sm_margin, zero_centered_gamma): dg_tmp = torch.empty(dg_tmp_rows(x_, sm_margin), N, device='cuda', dtype=torch.float32, requires_grad=False) if need_reduction else None grid_bwd = lambda meta: (NUM_PRGMS, ) + input_aligned_16 = (x_.data_ptr() % 16 == 0) and (x_.stride(-1) % 16 == 0) + grad_output_aligned_16 = (dz_.data_ptr() % 16 == 0) and (dz_.stride(-1) % 16 == 0) + dx_aligned_16 = (dx.data_ptr() % 16 == 0) and (dx.stride(-1) % 16 == 0) _rmsnorm_bwd_triton[grid_bwd](dz_, x_, gamma_, rsigma_, dx, dg_tmp if need_reduction else dgamma, x_.stride(0), dz_.stride(0), M, N, zero_centered_gamma, blk_size, - USE_BLOCKED, NUM_PRGMS, num_warps=8) + USE_BLOCKED, NUM_PRGMS, input_aligned_16, grad_output_aligned_16, dx_aligned_16, num_warps=8) if need_reduction: grid_reduce = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])] @@ -440,6 +459,11 @@ def te_rmsnorm_fwd_triton( grid_fwd = lambda meta: (NUM_PRGMS, ) + input_aligned_16 = (input.data_ptr() % 16 == 0) and (input.stride(-1) % 16 == 0) + out_alignment_tensor = out._data if hasattr(out, "_data") else out + output_aligned_16 = 
(out_alignment_tensor.data_ptr() % 16 == 0) and ( + out_alignment_tensor.stride(-1) % 16 == 0 + ) # TODO(micky774) Implement fused MXFP8 quantization within the kernel _rmsnorm_fwd_triton[grid_fwd]( out_ptr, @@ -462,6 +486,8 @@ def te_rmsnorm_fwd_triton( IS_FP8, FP8_MAX, MAKE_TRANSPOSE, + input_aligned_16, + output_aligned_16, ) if IS_MXFP8: out = quantizer.quantize(out) From 20af9e49e7e1afe0f1bb496ee14574e6f3fb2020 Mon Sep 17 00:00:00 2001 From: Veera Rajasekhar Reddy Gopu Date: Fri, 16 Jan 2026 08:11:32 +0000 Subject: [PATCH 2/2] CI: Add Github Actions support to release_v2.2_rocm --- .github/workflows/rocm-ci.yml | 437 ++++++++++++++++++++++++++++++++++ ci/ci_config.json | 7 + 2 files changed, 444 insertions(+) create mode 100644 .github/workflows/rocm-ci.yml create mode 100644 ci/ci_config.json diff --git a/.github/workflows/rocm-ci.yml b/.github/workflows/rocm-ci.yml new file mode 100644 index 000000000..ec5010054 --- /dev/null +++ b/.github/workflows/rocm-ci.yml @@ -0,0 +1,437 @@ +# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. + +name: TransformerEngine CI + +on: + push: + branches: + - 'dev' + - 'release_v1.*_rocm' + - 'release_v2.*_rocm' + pull_request: + branches: + - 'dev' + - 'release_v1.**_rocm' + - 'release_v2.**_rocm' + workflow_dispatch: + inputs: + test_level: + description: 'Test Level (1-3)' + required: true + default: '1' + skip_dev_merge: + description: 'Skip merging dev branch' + type: boolean + default: false + docker_image_override: + description: 'Manual Docker Image (Leave empty to use config file value)' + required: false + type: string + test_config_from_source: + description: 'DEBUG: Use config.json from current source branch instead of dev' + type: boolean + default: false + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_and_test: + name: Build and Test on GPU (${{ matrix.runner }}) + timeout-minutes: 720 + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-8, linux-mi355-8] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: 'recursive' + fetch-depth: 0 + + - name: Host Diagnostics & Environment Setup + id: host-setup + run: | + # Host Activity Checks + echo "::group::Host Diagnostics" + + echo ">>> Active Containers:" + docker ps -a + + echo ">>> ROCm Installation:" + ls -d /opt/rocm* || echo "No /opt/rocm found" + echo ">>> GPU info:" + ls -l /dev/dri + ls -l /dev/kfd + rocm-smi + + echo ">>> Kernel Command Line:" + cat /proc/cmdline + echo "::endgroup::" + + # Calculate Test Level + # Default to input (or '1' if input is missing/null) + CALC_LEVEL="${{ inputs.test_level || '1' }}" + + # COnly force Level 3 if this is a direct PUSH to dev or a release branch + if [[ "${{ github.event_name }}" == "push" ]]; then + if [[ "${{ github.ref_name }}" == "dev" || "${{ github.ref_name }}" =~ ^release_v.*_rocm$ ]]; then + echo "::notice::Push to monitored branch (${{ github.ref_name }}) detected. Forcing Level 3." 
+ CALC_LEVEL="3" + fi + fi + + echo "TEST_LEVEL=$CALC_LEVEL" >> $GITHUB_ENV + + # Print Final Environment + echo "::group::Environment & Parameters" + echo "Final Test Level: $CALC_LEVEL" + echo "Event Name: ${{ github.event_name }}" + echo "Ref Name: ${{ github.ref_name }}" + echo "Base Ref: ${{ github.base_ref }}" + env | sort + echo "::endgroup::" + + - name: Select Docker Image Tag + id: select-image + run: | + # Determine config source + # Default we are fetching from 'dev' branch + CONFIG_BRANCH="dev" + + # If manual run requesting source config, switch branch + if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then + CONFIG_BRANCH="${{ github.ref_name }}" + echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)" + fi + + # Download config + CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json" + echo "Attempting to fetch image config from: $CONFIG_URL" + + if curl -s -f -o docker_config.json "$CONFIG_URL"; then + echo "Successfully downloaded config from $CONFIG_BRANCH." + else + echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)." + + # Fallback: Check source branch file + if [[ -f "ci/ci_config.json" ]]; then + echo "::notice::Falling back to local 'ci/ci_config.json' from checkout." + cp ci/ci_config.json docker_config.json + else + echo "::error::Config file not found in $CONFIG_BRANCH OR locally." + exit 1 + fi + fi + + # Determine image key + BRANCH_NAME="${{ github.base_ref || github.ref_name }}" + echo "Determining image for branch: $BRANCH_NAME" + + # Logic: Check if branch matches "release_vX.X". + # If so, look for that key in JSON. Otherwise default. + JSON_KEY="default" + + if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then + VERSION_KEY="release_v${BASH_REMATCH[1]}" + # Check if this specific version key exists in the JSON + if [[ $(jq "(.docker_images | has(\"$VERSION_KEY\"))" docker_config.json) == "true" ]]; then + JSON_KEY="$VERSION_KEY" + fi + fi + + echo "Selected config key: $JSON_KEY" + + # Extract image name from json + IMAGE_TO_USE=$(jq -r ".docker_images.\"$JSON_KEY\"" docker_config.json) + + # Check input from workflow_dispatch overriding the image + MANUAL_OVERRIDE="${{ inputs.docker_image_override }}" + if [[ -n "$MANUAL_OVERRIDE" ]]; then + echo "::notice::Manual override detected: $MANUAL_OVERRIDE" + IMAGE_TO_USE="$MANUAL_OVERRIDE" + fi + + echo "Selected image: $IMAGE_TO_USE" + echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT + + - name: Pull Docker Image + run: | + docker pull ${{ steps.select-image.outputs.image-tag }} + + - name: Run Container + run: | + docker run -dt \ + --name te-runner \ + --network=host \ + --device=/dev/dri --device=/dev/kfd \ + --shm-size=16G \ + --pid=host \ + --group-add $(getent group render | cut -d: -f3) \ + --group-add $(getent group video | cut -d: -f3) \ + -v "${{ github.workspace }}:/workspace" \ + -w /workspace \ + ${{ steps.select-image.outputs.image-tag}} + + - name: Container Diagnostics & GPU Setup + id: container-diag + run: | + echo "::group::Container Configuration" + # Check Shared Memory Size inside container + echo ">>> /dev/shm size:" + docker exec te-runner df -h /dev/shm + + # Check OS/Kernel inside container + echo ">>> Container OS:" + docker exec te-runner cat /etc/os-release | grep PRETTY_NAME + echo "::endgroup::" + + echo "::group::ROCm Diagnostics (Host vs Container)" + echo ">>> CONTAINER rocm-smi:" + docker exec te-runner rocm-smi || true + echo 
"::endgroup::" + + # Determine Architecture + # Run rocminfo inside the container and capture the output + ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'") + + if [ -z "$ARCH" ]; then + echo "::error::Could not determine GPU architecture using rocminfo inside the container." + docker exec te-runner rocminfo + exit 1 + fi + + echo "Detected GPU Arch: $ARCH" + echo "arch=$ARCH" >> $GITHUB_OUTPUT + + - name: Build Project + run: | + docker exec \ + -e GPU_ARCH=${{ steps.container-diag.outputs.arch }} \ + te-runner bash -c "$(cat <<'EOF' + set -ex + + export HIP_PATH="" + export PYTORCH_ROCM_ARCH=$GPU_ARCH + export NVTE_ROCM_ARCH=$GPU_ARCH + export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts + pip install ninja + git config --global --add safe.directory '*' + pip install --no-build-isolation -v . 2>&1 + EOF + )" + + - name: Run sGPU tests + id: sgpu-tests + continue-on-error: true + run: | + # Cleanup previous failure markers if any. Don't actually do anything on k8s pods + rm -f FAIL_* + + docker exec \ + -e TEST_SGPU=1 \ + -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -x -o pipefail + ulimit -c 0 # Disable core dumps + + # debug output + ls -d /opt/rocm* + python --version + pip list | egrep "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext" + + HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 & + torch_pid=$!; echo Pytorch test pid $! + + HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 & + jax_pid=$!; echo JAX test pid $! + + HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 & + core_pid=$!; echo Core test pid $! + + wait $core_pid; core_rc=$? + wait $jax_pid; jax_rc=$? + wait $torch_pid; torch_rc=$? + + # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later + # Check PyTorch + if [ $torch_rc -ne 0 ]; then + echo "::group::[FAILED] PyTorch sGPU Log" + cat /workspace/torch_sgpu.log + echo "::endgroup::" + echo "::error::Pytorch sGPU test FAILED." + touch /workspace/FAIL_TORCH_SGPU + fi + + # Check JAX + if [ $jax_rc -ne 0 ]; then + echo "::group::[FAILED] JAX sGPU Log" + cat /workspace/jax_sgpu.log + echo "::endgroup::" + echo "::error::JAX sGPU test FAILED." + touch /workspace/FAIL_JAX_SGPU + fi + + # Check Core + if [ $core_rc -ne 0 ]; then + echo "::group::[FAILED] Core sGPU Log" + cat /workspace/core_sgpu.log + echo "::endgroup::" + echo "::error::Core sGPU test FAILED." + touch /workspace/FAIL_CORE_SGPU + fi + + test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $core_rc -eq 0 + EOF + )" + + # Export failed tests statuses to host runner + if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> $GITHUB_OUTPUT; fi + + - name: Run mGPU tests + id: mgpu-tests + continue-on-error: true + run: | + docker exec \ + -e TEST_MGPU=1 \ + -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ + te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -x -o pipefail + ulimit -c 0 # Disable core dumps + + # Run PyTorch + ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1 + torch_rc=$? + + # Run JAX + ci/jax.sh > /workspace/jax_mgpu.log 2>&1 + jax_rc=$? 
+ + # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later + if [ $torch_rc -ne 0 ]; then + echo "::group::[FAILED] PyTorch mGPU Log" + cat /workspace/torch_mgpu.log + echo "::endgroup::" + echo "::error::Pytorch mGPU test FAILED." + touch /workspace/FAIL_TORCH_MGPU + fi + + if [ $jax_rc -ne 0 ]; then + echo "::group::[FAILED] JAX mGPU Log" + cat /workspace/jax_mgpu.log + echo "::endgroup::" + echo "::error::JAX mGPU test FAILED." + touch /workspace/FAIL_JAX_MGPU + fi + + test $torch_rc -eq 0 -a $jax_rc -eq 0 + EOF + )" + + # Export failed tests statuses to host runner + if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> $GITHUB_OUTPUT; fi + if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> $GITHUB_OUTPUT; fi + + - name: Run Examples + id: examples-tests + continue-on-error: true + run: | + docker exec te-runner bash -c "$(cat <<'EOF' + #!/usr/bin/bash + set -ex -o pipefail + ulimit -c 0 # Disable core dumps + + cd /workspace/examples/pytorch/mnist + python main.py 2>&1 | tee /workspace/examples.log + python main.py --use-te 2>&1 | tee -a /workspace/examples.log + python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log + + cd /workspace/examples/jax/mnist + pip3 install -r requirements.txt + python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log + + cd /workspace/examples/jax/encoder + pip3 install -r requirements.txt + python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log + python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log + EOF + )" + + - name: Check Test Failure Status + if: always() + run: | + EXIT_STATUS=0 + # Check outcomes of the specific test steps + # "outcome" will be 'failure' even if continue-on-error was true + + # sGPU CHECKS + # We check for the file existence directly because the 'Run sGPU tests' step + # halts immediately on docker failure, skipping the lines that set step outputs. + if [[ -f FAIL_CORE_SGPU ]]; then + echo "::error::Core sGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_TORCH_SGPU ]]; then + echo "::error::PyTorch sGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_JAX_SGPU ]]; then + echo "::error::JAX sGPU Tests Failed." + EXIT_STATUS=1 + fi + + # mGPU CHECKS + if [[ -f FAIL_TORCH_MGPU ]]; then + echo "::error::PyTorch mGPU Tests Failed." + EXIT_STATUS=1 + fi + if [[ -f FAIL_JAX_MGPU ]]; then + echo "::error::JAX mGPU Tests Failed." + EXIT_STATUS=1 + fi + + # EXAMPLES CHECK + # Examples script does not use marker files, so we rely on step outcome + if [[ "${{ steps.examples-tests.outcome }}" == "failure" ]]; then + echo "::error::Example Tests Failed." 
+ EXIT_STATUS=1 + fi + + # Fail the job if any errors were detected + if [[ "$EXIT_STATUS" == "1" ]]; then + exit 1 + fi + + - name: Copy logs and reports from container + if: always() + run: | + docker cp te-runner:/workspace/torch_sgpu.log ./torch_sgpu.log || true + docker cp te-runner:/workspace/jax_sgpu.log ./jax_sgpu.log || true + docker cp te-runner:/workspace/core_sgpu.log ./core_sgpu.log || true + docker cp te-runner:/workspace/torch_mgpu.log ./torch_mgpu.log || true + docker cp te-runner:/workspace/jax_mgpu.log ./jax_mgpu.log || true + + - name: Upload logs and test reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: logs-and-reports-${{ matrix.runner }} + path: | + *.log + if-no-files-found: ignore + retention-days: 5 + + - name: Cleanup container + if: always() + run: docker rm -f te-runner || true \ No newline at end of file diff --git a/ci/ci_config.json b/ci/ci_config.json new file mode 100644 index 000000000..f87079608 --- /dev/null +++ b/ci/ci_config.json @@ -0,0 +1,7 @@ +{ + "docker_images": { + "default": "registry-sc-harbor.amd.com/framework/te-ci:rocm-7.1.1_ubuntu22.04_py3.11_pytorch_release_2.8_63e525b2_jax_0.7.1_fa-2.8.0", + "release_v1.13": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273", + "release_v1.14": "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273" + } + } \ No newline at end of file
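
Note on the rmsnorm change in patch 1/2: every hunk there follows the same pattern of computing a 16-byte alignment predicate on the host and forwarding it to the Triton kernel as a tl.constexpr flag, so that tl.multiple_of pointer hints are only emitted when they actually hold. The standalone sketch below illustrates that pattern under simplified assumptions (a 1-D contiguous tensor, so only the base address is checked); the kernel and helper names (_scale_kernel, is_16_byte_aligned) are illustrative and not part of the patch.

import torch
import triton
import triton.language as tl


def is_16_byte_aligned(t: torch.Tensor) -> bool:
    # For this 1-D contiguous example only the base address matters; the 2-D
    # rmsnorm kernels above also take the row stride into account.
    return t.data_ptr() % 16 == 0


@triton.jit
def _scale_kernel(x_ptr, y_ptr, n, s, BLOCK: tl.constexpr, ALIGNED_16: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x_ptrs = x_ptr + offs
    if ALIGNED_16:
        # Resolved at compile time: unaligned inputs get a specialization
        # without the hint instead of a misleading one.
        x_ptrs = tl.multiple_of(x_ptrs, (16, ))
    x = tl.load(x_ptrs, mask=mask, other=0.0)
    tl.store(y_ptr + offs, x * s, mask=mask)


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    y = torch.empty_like(x)
    n = x.numel()
    grid = (triton.cdiv(n, 1024), )
    _scale_kernel[grid](x, y, n, s, BLOCK=1024, ALIGNED_16=is_16_byte_aligned(x))
    return y

Because the flag is a tl.constexpr, the aligned and unaligned cases compile to separate kernel specializations and no per-element runtime check is added, which mirrors how INPUT_ALIGNED_16 / OUTPUT_ALIGNED_16 and the backward-side flags are wired through te_rmsnorm_fwd_triton and te_rmsnorm_bwd_triton in the patch.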