diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index d12e01544..aaecf9810 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -1,5 +1,15 @@
 name: ete_test
+
+permissions:
+  contents: write
+  pages: write
+  id-token: write
+
 on:
+  pull_request:
+    branches:
+      - "main"
+      - "refactor"
   workflow_dispatch:
     inputs:
       repo_org:
@@ -19,18 +29,40 @@ jobs:
   ete_test:
     if: ${{!cancelled() }}
     runs-on: [h_cluster_ete]
+    permissions:
+      contents: write
+      pages: write
+      id-token: write
     steps:
       - name: Clean workdir
         run: sudo git clean -ffdx
       - name: Clone repository
        uses: actions/checkout@v2
-        with:
-          repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
-          ref: ${{github.event.inputs.repo_ref || 'main'}}
+        #with:
+          #repository: ${{ github.event.inputs.repo_org || 'InternLM/xtuner' }}
+          #ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: run-test
         run: |
           source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate
           conda activate clusterx
           conda env list
           unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
-          pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}
+          pytest autotest/test_all.py::test_all[qwen3-sft] -m all -n 1 -vv --run_id ${{ github.run_id }}
+
+      - name: Upload Artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: ${{ github.workspace }}/${{ github.run_id }}
+          if-no-files-found: ignore
+          retention-days: 7
+          name: xtuner-e2e-${{ github.run_id }}
+
+      - name: Deploy to GitHub Pages
+        if: ${{ !cancelled() }}
+        uses: JamesIves/github-pages-deploy-action@v4
+        with:
+          token: ${{ github.token }}
+          branch: gh-pages
+          folder: ./${{ github.run_id }}
+          target-folder: ${{ github.run_id }}
diff --git a/autotest/module/train.py b/autotest/module/train.py
index 410ccf1fe..db6495e76 100644
--- a/autotest/module/train.py
+++ b/autotest/module/train.py
@@ -56,7 +56,7 @@ def validate(config):
     )
     cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl")
     check_metrics = config.get("assert_info", {}).get("check_metrics", {})
-    return check_result(base_path, cur_path, check_metrics)
+    return check_result(config["case_name"], base_path, cur_path, check_metrics)
 
 def pre_action(config=None):
     action_info = config.get("pre_action", None)
diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py
index cc2b6d57d..2c3ec0ec5 100644
--- a/autotest/utils/check_metric.py
+++ b/autotest/utils/check_metric.py
@@ -1,8 +1,12 @@
 import json
 import logging
+import os
+import shutil
+
+import matplotlib.pyplot as plt
+import numpy as np
+from pathlib import Path
 from statistics import mean
-
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 )
@@ -21,8 +25,76 @@ def extract_value(file, metrics):
     return total_step, metric_all
 
 
+def plot_all(case_name, check_metric, base_metrics, cur_metrics, output_root: Path):
+    metric_list = list(check_metric.keys())
+    n_plots = len(metric_list)
+    n_cols = int(np.ceil(np.sqrt(n_plots)))
+    n_rows = int(np.ceil(n_plots / n_cols))
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3))
+    axes = np.array(axes).flatten()
+
+    for i, ax in enumerate(axes):
+        if i < n_plots:
+            x_base = np.arange(len(base_metrics[metric_list[i]]))
+            x_current = np.arange(len(cur_metrics[metric_list[i]]))
+            ax.plot(
+                x_base,
+                base_metrics[metric_list[i]],
+                "r--",
+                label="Base",
+                marker="x",
+                markersize=4,
+            )
+            ax.plot(
+                x_current,
+                cur_metrics[metric_list[i]],
+                "b-",
+                label="Current",
+                marker="o",
+                markersize=4,
+            )
+            ax.set_title(f"{metric_list[i].replace('/', '_')}_comparison")
+            ax.set_xlabel("Step")
+            ax.set_ylabel("Value")
+            ax.legend()
+            ax.grid(True, linestyle="--", alpha=0.7)
+        else:
+            ax.axis("off")
+    fig.suptitle(f"{case_name}_metrics_comparison", fontsize=16)
+    plt.tight_layout()
+    plt.savefig(output_root / f"{case_name}_comparison.png")
+    plt.close()
+
+
+def write_to_summary(case_name, base_jsonl, cur_jsonl):
+
+    summary_file = os.environ.get('GITHUB_STEP_SUMMARY', './tmp.md')
+    with open(summary_file, 'a') as f:
+        f.write(f"## {case_name} metrics comparison\n")
+        f.write('<div align="center">\n')
+        f.write(f'<img src="{case_name}_comparison.png">\n')
+        f.write('</div>\n<br>\n')
+        f.write(f'<details>\n<summary>📊 Click to view metric data for case {case_name} (baseline first, then current run)</summary>\n\n')
-def check_result(base_path, cur_path, check_metric):
+    for json_f in [base_jsonl, cur_jsonl]:
+        with open(json_f, 'r', encoding='utf-8') as f:
+            lines = [line.strip() for line in f if line.strip()]
+
+        md_content = '```json\n'
+        for i, line in enumerate(lines, 1):
+            md_content += f'{line}\n'
+
+        md_content += '```\n\n'
+
+
+        with open(summary_file, 'a', encoding='utf-8') as f:
+            f.write(md_content)
+    with open(summary_file, 'a') as f:
+        f.write('</details>\n')
+
+
+def check_result(case_name, base_path, cur_path, check_metric):
     fail_metric = {}
     check_metric = check_metric
     metric_list = list(check_metric.keys())
@@ -32,6 +104,12 @@ def check_result(base_path, cur_path, check_metric):
             f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}"
         )
 
+    output_path = Path(f"../{os.environ['GITHUB_RUN_ID']}")
+    output_path.mkdir(parents=True, exist_ok=True)
+    plot_all(case_name, check_metric, base_metrics, cur_metrics, output_path)
+    shutil.copytree(output_path, f"./{os.environ['GITHUB_RUN_ID']}", dirs_exist_ok=True)
+    write_to_summary(case_name, base_path, cur_path)
+
     for metric, threshold in check_metric.items():
         max_error = 0.0
         max_error_idx = 0
@@ -75,4 +153,4 @@ def check_result(base_path, cur_path, check_metric):
         return result, f"Some metric check failed,{fail_metric}"
 
 if __name__ == "__main__":
-    print(check_result("./base//tracker.jsonl","./current/tracker.jsonl",{"grad_norm":0.000001,"loss/reduced_llm_loss":0.000001,"lr":0,"memory/max_memory_GB":0.2,"runtime_info/tgs":0.05,"runtime_info/text_tokens":0}))
+    print(check_result("qwen3-sft", "./base/tracker.jsonl", "./current/tracker.jsonl", {"grad_norm":0.000001,"loss/reduced_llm_loss":0.000001,"lr":0,"memory/max_memory_GB":0.2,"runtime_info/tgs":0.05,"runtime_info/text_tokens":0}))
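
For reviewers who want to exercise the new plotting and summary path without triggering the workflow, something along the following lines should work locally. It is a rough sketch, not part of this PR: the tracker.jsonl schema, the faked GITHUB_RUN_ID / GITHUB_STEP_SUMMARY values, and the import path are assumptions.

```python
# Local sketch (assumptions flagged inline): fabricate two identical tracker.jsonl
# files and run the new check_result() signature end to end, so the comparison PNG
# and the step-summary markdown can be inspected without a CI run.
import json
import os
from pathlib import Path

# GitHub Actions normally injects these; fake them for a local run.
os.environ.setdefault("GITHUB_RUN_ID", "local-run")
os.environ.setdefault("GITHUB_STEP_SUMMARY", "./tmp.md")

# Assumes the repo root is on PYTHONPATH so autotest/ is importable as a package.
from autotest.utils.check_metric import check_result


def write_tracker(path: str) -> None:
    # Assumed tracker.jsonl shape: one flat JSON object per step containing the
    # metric names referenced by the check_metric thresholds below.
    with open(path, "w", encoding="utf-8") as f:
        for step in range(3):
            f.write(json.dumps({"grad_norm": 0.5 + 0.1 * step, "lr": 1e-5}) + "\n")


Path("base").mkdir(exist_ok=True)
Path("current").mkdir(exist_ok=True)
write_tracker("base/tracker.jsonl")
write_tracker("current/tracker.jsonl")

# Plots land in ../$GITHUB_RUN_ID and ./$GITHUB_RUN_ID; the markdown goes to ./tmp.md.
print(check_result("qwen3-sft", "base/tracker.jsonl", "current/tracker.jsonl",
                   {"grad_norm": 0.000001, "lr": 0}))
```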