From 5e0ee6989b360bb99be648cee41be15421338f97 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Thu, 15 Jan 2026 22:09:33 +0800 Subject: [PATCH 01/17] add eval_backend_perf --- graph_net_bench/torch/eval_backend_diff.py | 419 +++++---------------- graph_net_bench/torch/eval_backend_perf.py | 337 +++++++++++++++++ graph_net_bench/torch/utils.py | 11 + test/eval_backend_diff_test.sh | 13 +- 4 files changed, 458 insertions(+), 322 deletions(-) create mode 100644 graph_net_bench/torch/eval_backend_perf.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 8488b71b7..07a19ff88 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,103 +1,18 @@ from . import utils import subprocess import argparse -import importlib.util import torch -from pathlib import Path -from typing import Type import sys import os import os.path import traceback import json -import random -import numpy as np -import platform import base64 -from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench.torch.backend.tvm_backend import TvmBackend -from graph_net_bench.torch.backend.xla_backend import XlaBackend -from graph_net_bench.torch.backend.inductor_backend import InductorBackend -from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend -from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend -from graph_net_bench.torch.backend.nope_backend import NopeBackend -from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend -from graph_net_bench.torch.backend.unstable_to_stable_backend import ( - UnstableToStableBackend, -) -from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( - RangeDecomposerValidatorBackend, -) -from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( - GraphVariableRenamerValidatorBackend, -) +import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -compiler_backend_name2class = { - "tvm": TvmBackend, - "xla": XlaBackend, - "inductor": InductorBackend, - "tensorrt": TensorRTBackend, - "bladedisc": BladeDISCBackend, - "nope": NopeBackend, - "pass_mgr": PassMgrBackend, - "unstable_to_stable": UnstableToStableBackend, - "range_decomposer_validator": RangeDecomposerValidatorBackend, - "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, -} - - -def set_seed(random_seed): - random.seed(random_seed) - np.random.seed(random_seed) - torch.manual_seed(random_seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_seed) - torch.cuda.manual_seed_all(random_seed) - - -def get_hardward_name(args): - hardware_name = "unknown" - if "cuda" in args.device: - hardware_name = torch.cuda.get_device_name(args.device) - elif args.device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compile_framework_version(args): - if args.compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{args.compiler.capitalize()} {args.compiler.version}" - return "unknown" - - -def load_class_from_file( - args: argparse.Namespace, class_name: str, device: str -) -> Type[torch.nn.Module]: - file_path = f"{args.model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - - with 
open(file_path, "r", encoding="utf-8") as f: - model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) - spec = importlib.util.spec_from_loader(module_name, loader=None) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") - exec(compiled_code, module.__dict__) - - model_class = getattr(module, class_name, None) - setattr(model_class, "__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class - - def convert_to_dict(config_str): if config_str in {None, "", "null", "None"}: return {} @@ -107,203 +22,6 @@ def convert_to_dict(config_str): return config -def get_compiler_backend(args) -> GraphCompilerBackend: - assert ( - args.compiler in compiler_backend_name2class - ), f"Unknown compiler: {args.compiler}" - backend_class = compiler_backend_name2class[args.compiler] - return backend_class(args.backend_config) - - -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file(args, class_name="GraphModule", device=device) - model = model_class().to(torch.device(args.device)) - return model - - -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] - for tensor_meta in params.values(): - if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device - return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() - } - - -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() - - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() - - hardware_name = get_hardward_name(args) - print( - f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", - file=sys.stderr, - flush=True, - ) - - if "cuda" in args.device: - """ - Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings, - With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench - """ - - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - 
stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - - return outs, stats - - -def test_single_model(args): - compiler = get_compiler_backend(args) - input_dict = get_input_dict(args) - model = get_model(args) - model_path = os.path.normpath(args.model_path) - test_compiler_util.print_with_log_prompt( - "[Processing]", model_path, args.log_prompt - ) - test_compiler_util.print_basic_config( - args, get_hardward_name(args), get_compile_framework_version(args) - ) - - runtime_seed = 1024 - eager_failure = False - expected_out = None - eager_time_stats = {} - - try: - - def eager_model_call(): - return model(**input_dict) - - expected_out, eager_time_stats = measure_performance( - eager_model_call, args, compiler - ) - - torch.manual_seed(runtime_seed) - if not isinstance(expected_out, tuple): - expected_out = (expected_out,) - except (TypeError, RuntimeError) as e: - print(f"Eager model execution failed: {str(e)}", file=sys.stderr) - eager_failure = True - - compiled_failure = False - compiled_model = None - compiled_time_stats = {} - - try: - compiled_model = compiler(model) - torch.manual_seed(runtime_seed) - - def compiled_model_call(): - return compiled_model(**input_dict) - - compiled_out, compiled_time_stats = measure_performance( - compiled_model_call, args, compiler - ) - - if not isinstance(compiled_out, tuple): - compiled_out = (compiled_out,) - if args.compiler == "xla": - compiled_out = tuple(item.to("cpu").to("cuda") for item in compiled_out) - except (TypeError, RuntimeError) as e: - print(f"Compiled model execution failed: {str(e)}", file=sys.stderr) - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - except Exception as e: - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - - if eager_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to eager model execution error.]", - file=sys.stderr, - flush=True, - ) - elif compiled_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to compiled model execution error.]", - file=sys.stderr, - flush=True, - ) - else: - compare_correctness(expected_out, compiled_out, args) - - print( - f"{args.log_prompt} [Result] status: success", file=sys.stderr, flush=True - ) - - test_compiler_util.print_times_and_speedup( - args, eager_time_stats, compiled_time_stats - ) - - -def print_and_store_cmp(key, cmp_func, args, expected_out, compiled_out, **kwargs): - cmp_ret = cmp_func(expected_out, compiled_out, **kwargs) - print( - f"{args.log_prompt} [Correctness]{key}: {cmp_ret}", - file=sys.stderr, - flush=True, - ) - return cmp_ret - - def compare_correctness(expected_out, compiled_out, args): eager_dtypes = [ ( @@ -386,13 +104,24 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol): return " ".join(results) -def get_sample_root(args): - return args.model_path_prefix +def parse_time_stats_from_reference_log(log_path): + assert os.path.isfile( + log_path + ), f"{log_path} does not exist or is not a regular file." 
+ + with open(log_path, "r", encoding="utf-8") as f: + lines = f.readlines() + for line in reversed(lines): + if "[Performance][eager]" in line: + start = line.find("{") + end = line.rfind("}") + time_stats = json.loads(line[start : end + 1]) + return time_stats -def test_multi_models(args): +def eval_multi_models(args, model_path_prefix): test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, get_sample_root(args) + args.model_path_list, model_path_prefix ) sample_idx = 0 @@ -435,15 +164,15 @@ def test_multi_models(args): print(f"- {model_path}", file=sys.stderr, flush=True) -def test_multi_models_with_prefix(args): - assert os.path.isdir(args.model_path_prefix) +def eval_multi_models_with_prefix(args, model_path_prefix): + assert os.path.isdir(model_path_prefix) assert os.path.isfile(args.model_path_list) test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, get_sample_root(args) + args.model_path_list, model_path_prefix ) py_module_name = os.path.splitext(os.path.basename(__file__))[0] for rel_model_path in test_samples: - model_path = os.path.join(args.model_path_prefix, rel_model_path) + model_path = os.path.join(model_path_prefix, rel_model_path) if not os.path.exists(model_path): continue if not os.path.exists(os.path.join(model_path, "model.py")): @@ -467,39 +196,92 @@ def test_multi_models_with_prefix(args): traceback.print_exc() +def compare_perf_diff(args, model_path, ref_dir, target_dir): + # A + ref_dump_path = utils.get_output_path(ref_dir, model_path) + ref_out = torch.load(str(ref_dump_path)) + + ref_log_path = utils.get_log_path(ref_dir, model_path) + ref_time_stats = parse_time_stats_from_reference_log(ref_log_path) + + # B + target_dump_path = utils.get_output_path(target_dir, model_path) + target_out = torch.load(str(target_dump_path)) + + target_log_path = utils.get_log_path(target_dir, model_path) + target_time_stats = parse_time_stats_from_reference_log(target_log_path) + + compare_correctness(ref_out, target_out, args) + + test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) + + +def eval_single_model(args): + ref_dir = "/tmp/eval_perf_diff/A" + target_dir = "/tmp/eval_perf_diff/B" + + EvalCfg = types.SimpleNamespace( + ref_env=types.SimpleNamespace(**convert_to_dict(args.config)["ref_env"]), + target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), + ) + + ref_args = build_sub_args(EvalCfg.ref_env) + target_args = build_sub_args(EvalCfg.target_env) + + run_sub_process(ref_args, args.model_path, ref_dir) + run_sub_process(target_args, args.model_path, target_dir) + compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) + + +def run_sub_process(env_args, model_path, output_path): + cmd = [sys.executable, "-m", "graph_net_bench.torch.eval_backend_perf"] + args_pairs = [ + ("--model-path", model_path), + ("--output-path", output_path), + ("--seed", str(env_args.seed)), + ("--compiler", env_args.compiler), + ("--device", env_args.device), + ("--op-lib", env_args.op_lib), + ("--warmup", str(env_args.warmup)), + ("--trials", str(env_args.trials)), + ("--log-prompt", env_args.log_prompt), + ("--model-path-prefix", env_args.model_path_prefix), + ("--config", env_args.backend_config), + ] + + for arg_name, arg_value in args_pairs: + if arg_value is not None: + cmd.extend([arg_name, arg_value]) + + subprocess.run(cmd, check=True) + + +def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: + sub = argparse.Namespace() + sub.seed = getattr(env_ns, 
"seed", 123) + sub.compiler = getattr(env_ns, "compiler", None) + sub.device = getattr(env_ns, "device", None) + sub.op_lib = getattr(env_ns, "op_lib", None) + sub.warmup = getattr(env_ns, "warmup", 3) + sub.trials = getattr(env_ns, "trials", 5) + sub.log_prompt = getattr(env_ns, "log_prompt", None) + sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) + sub.backend_config = getattr(env_ns, "backend_config", None) + return sub + + def main(args): - if args.model_path_list is not None and args.model_path_prefix is not None: - test_multi_models_with_prefix(args) + config_dict = convert_to_dict(args.config) + model_path_prefix = config_dict["ref_env"]["model_path_prefix"] + if args.model_path_list is not None and model_path_prefix is not None: + eval_multi_models_with_prefix(args, model_path_prefix) return assert os.path.isdir(args.model_path) - initalize_seed = 123 - set_seed(random_seed=initalize_seed) - if path_utils.is_single_model_dir(args.model_path): - test_single_model(args) + eval_single_model(args) else: - test_multi_models(args) - - -def complete_default_args( - mut_args, - compiler: str = "inductor", # Compiler name - device: str = "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') - warmup: int = 3, # Number of warmup steps - trials: int = 5, # Number of timing trials - log_prompt: str = "graph-net-test-compiler-log", # Log prompt for performance log filtering - model_path_prefix: str = None, # Prefix path to model path in --model-path-list - backend_config: dict = None, # backend configuration json -): - backend_config = backend_config if backend_config is not None else {} - mut_args.compiler = compiler - mut_args.device = device - mut_args.warmup = warmup - mut_args.trials = trials - mut_args.log_prompt = log_prompt - mut_args.model_path_prefix = model_path_prefix - mut_args.backend_config = backend_config + eval_multi_models(args, model_path_prefix) if __name__ == "__main__": @@ -526,5 +308,4 @@ def complete_default_args( help="base64 encode configuration json.", ) args = parser.parse_args() - complete_default_args(args, **convert_to_dict(args.config)) main(args=args) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py new file mode 100644 index 000000000..7e12f6ebf --- /dev/null +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -0,0 +1,337 @@ +from . 
import utils +import argparse +import importlib.util +import torch +from pathlib import Path +from typing import Type +import sys +import os +import traceback +import json +import random +import numpy as np +import platform +import base64 +from contextlib import redirect_stdout, redirect_stderr + +from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +from graph_net_bench.torch.backend.tvm_backend import TvmBackend +from graph_net_bench.torch.backend.xla_backend import XlaBackend +from graph_net_bench.torch.backend.inductor_backend import InductorBackend +from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend +from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend +from graph_net_bench.torch.backend.nope_backend import NopeBackend +from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend +from graph_net_bench.torch.backend.unstable_to_stable_backend import ( + UnstableToStableBackend, +) +from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( + RangeDecomposerValidatorBackend, +) +from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( + GraphVariableRenamerValidatorBackend, +) +from graph_net_bench import test_compiler_util + + +compiler_backend_name2class = { + "tvm": TvmBackend, + "xla": XlaBackend, + "inductor": InductorBackend, + "tensorrt": TensorRTBackend, + "bladedisc": BladeDISCBackend, + "nope": NopeBackend, + "pass_mgr": PassMgrBackend, + "unstable_to_stable": UnstableToStableBackend, + "range_decomposer_validator": RangeDecomposerValidatorBackend, + "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, +} + + +def register_op_lib(op_lib): + if op_lib == "flaggems": + import flag_gems + + flag_gems.enable() + else: + pass + + +def set_seed(random_seed): + random.seed(random_seed) + np.random.seed(random_seed) + torch.manual_seed(random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) + + +def get_hardward_name(args): + hardware_name = "unknown" + if "cuda" in args.device: + hardware_name = torch.cuda.get_device_name(args.device) + elif args.device == "cpu": + hardware_name = platform.processor() + return hardware_name + + +def get_compile_framework_version(args): + if args.compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{args.compiler.capitalize()} {args.compiler.version}" + return "unknown" + + +def load_class_from_file( + args: argparse.Namespace, class_name: str, device: str +) -> Type[torch.nn.Module]: + file_path = f"{args.model_path}/model.py" + file = Path(file_path).resolve() + module_name = file.stem + + with open(file_path, "r", encoding="utf-8") as f: + model_code = f.read() + model_code = utils.modify_code_by_device(model_code, device) + spec = importlib.util.spec_from_loader(module_name, loader=None) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + compiled_code = compile(model_code, filename=file, mode="exec") + exec(compiled_code, module.__dict__) + + model_class = getattr(module, class_name, None) + setattr(model_class, "__graph_net_file_path__", file_path) + setattr(model_class, "__graph_net_device__", device) + return model_class + + +def convert_to_dict(config_str): + if config_str is None or config_str == "None": + return {} + 
config_str = base64.b64decode(config_str).decode("utf-8") + config = json.loads(config_str) + assert isinstance(config, dict), f"config should be a dict. {config_str=}" + return config + + +def get_compiler_backend(args) -> GraphCompilerBackend: + assert ( + args.compiler in compiler_backend_name2class + ), f"Unknown compiler: {args.compiler}" + backend_class = compiler_backend_name2class[args.compiler] + config = convert_to_dict(args.config) if args.config is not None else {} + return backend_class(config) + + +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device + + # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') + model_class = load_class_from_file(args, class_name="GraphModule", device=device) + model = model_class().to(torch.device(args.device)) + return model + + +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") + params = inputs_params["weight_info"] + for tensor_meta in params.values(): + if "device" in tensor_meta["info"]: + tensor_meta["info"]["device"] = args.device + return { + k: utils.replay_tensor(v).to(torch.device(args.device)) + for k, v in params.items() + } + + +def measure_performance(model_call, args, compiler): + stats = {} + outs = model_call() + + # Warmup runs + for _ in range(args.warmup): + model_call() + compiler.synchronize() + + hardware_name = get_hardward_name(args) + print( + f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", + file=sys.stderr, + flush=True, + ) + + if "cuda" in args.device: + torch.cuda.empty_cache() + e2e_times = [] + gpu_times = [] + + for i in range(args.trials): + # End-to-end timing (naive_timer) + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + # GPU-only timing (CUDA Events) + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + + model_call() + + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + + else: # CPU or other devices + e2e_times = [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + e2e_times.append(duration_box.value) + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + + return outs, stats + + +def eval_single_model(args): + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) + print(f"Log path: {log_path}", file=sys.stderr, flush=True) + print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) + + with open(log_path, "w", encoding="utf-8") as log_f: + with redirect_stdout(log_f), redirect_stderr(log_f): + compiler = get_compiler_backend(args) + + input_dict = get_input_dict(args) + model = get_model(args) + model.eval() + + test_compiler_util.print_with_log_prompt( + "[Config] 
seed:", args.seed, args.log_prompt + ) + + test_compiler_util.print_basic_config( + args, + get_hardward_name(args), + get_compile_framework_version(args), + ) + + test_compiler_util.print_with_log_prompt( + "[Config] op_lib:", args.op_lib, args.log_prompt + ) + + success = False + time_stats = {} + try: + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + outputs, time_stats = measure_performance(model_call, args, compiler) + success = True + except Exception as e: + print( + f"Run model failed: {str(e)}\n{traceback.format_exc()}", + file=sys.stderr, + flush=True, + ) + + test_compiler_util.print_running_status(args, success) + if success: + torch.save(outputs, str(output_dump_path)) + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) + + with open(log_path, "r", encoding="utf-8") as f: + content = f.read() + print(content, file=sys.stderr, flush=True) + + +def main(args): + set_seed(args.seed) + os.makedirs(args.output_path, exist_ok=True) + eval_single_model(args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="GraphNet Backend Performance Evaluation" + ) + parser.add_argument( + "--model-path", + type=str, + required=False, + default=None, + help="Path to model file(s), each subdirectory containing graph_net.json will be regarded as a model", + ) + parser.add_argument( + "--output-path", + type=str, + required=False, + default="/tmp/test_save", + help="Path to save outputs", + ) + parser.add_argument("--seed", type=int, required=False, default=123) + parser.add_argument( + "--compiler", + type=str, + required=False, + default="inductor", + help="Path to customized compiler python file", + ) + parser.add_argument( + "--device", + type=str, + required=False, + default="cuda", + help="Device for testing the compiler (e.g., 'cpu' or 'cuda')", + ) + parser.add_argument("--op-lib", type=str, required=False, default=None) + parser.add_argument( + "--warmup", type=int, required=False, default=3, help="Number of warmup steps" + ) + parser.add_argument( + "--trials", type=int, required=False, default=5, help="Number of timing trials" + ) + parser.add_argument( + "--log-prompt", + type=str, + required=False, + default="graph-net-test-compiler-log", + help="Log prompt for performance log filtering.", + ) + parser.add_argument( + "--model-path-prefix", + type=str, + required=False, + default=None, + help="Prefix path to model path list", + ) + parser.add_argument( + "--config", + type=str, + required=False, + default=None, + help="base64 encode configuration json.", + ) + args = parser.parse_args() + main(args=args) diff --git a/graph_net_bench/torch/utils.py b/graph_net_bench/torch/utils.py index c937ff4de..700a59972 100755 --- a/graph_net_bench/torch/utils.py +++ b/graph_net_bench/torch/utils.py @@ -1,4 +1,5 @@ import torch +import os import ast import math import inspect @@ -7,6 +8,16 @@ kLiteralTensorSize = 64 +def get_log_path(log_dir, model_path): + model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") + return os.path.join(log_dir, f"{model_name}.log") + + +def get_output_path(output_dir, model_path): + model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") + return os.path.join(output_dir, f"{model_name}.pth") + + def get_limited_precision_float_str(value): if not isinstance(value, float): return value diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index e3fa79602..16da81903 100755 
--- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -10,9 +10,16 @@ python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ --config $(base64 -w 0 <&1 | tee "$OUTPUT_PATH/validation.log" From f83ab0cfd1bfdb325c356a0604e9cb717e45abde Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 08:56:00 +0800 Subject: [PATCH 02/17] Simplify eval_multi_models --- graph_net_bench/torch/eval_backend_diff.py | 123 ++++++++++----------- 1 file changed, 57 insertions(+), 66 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 07a19ff88..50d17cb62 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -119,81 +119,69 @@ def parse_time_stats_from_reference_log(log_path): return time_stats -def eval_multi_models(args, model_path_prefix): - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - +def eval_multi_models(args, model_path_prefix=None, use_model_list=False): sample_idx = 0 failed_samples = [] module_name = os.path.splitext(os.path.basename(__file__))[0] - for model_path in path_utils.get_recursively_model_path(args.model_path): - if test_samples is None or os.path.abspath(model_path) in test_samples: - print( - f"[{sample_idx}] {module_name}, model_path: {model_path}", - file=sys.stderr, - flush=True, - ) - cmd = " ".join( - [ - sys.executable, - f"-m graph_net_bench.torch.{module_name}", - f"--model-path {model_path}", - f"--config {args.config}", - ] - ) - try: - process = subprocess.Popen(cmd, shell=True) - cmd_ret = process.wait() - except KeyboardInterrupt: - print("KeyboardInterrupt") - sys.exit(1) - except Exception: - print("\n--- Full Traceback ---") - traceback.print_exc() - if cmd_ret != 0: - failed_samples.append(model_path) - sample_idx += 1 - - print( - f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", - file=sys.stderr, - flush=True, - ) - for model_path in failed_samples: - print(f"- {model_path}", file=sys.stderr, flush=True) - -def eval_multi_models_with_prefix(args, model_path_prefix): - assert os.path.isdir(model_path_prefix) - assert os.path.isfile(args.model_path_list) - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - py_module_name = os.path.splitext(os.path.basename(__file__))[0] - for rel_model_path in test_samples: - model_path = os.path.join(model_path_prefix, rel_model_path) - if not os.path.exists(model_path): - continue - if not os.path.exists(os.path.join(model_path, "model.py")): - continue + if use_model_list: + assert os.path.isdir(model_path_prefix) + assert os.path.isfile(args.model_path_list) + test_samples = test_compiler_util.get_allow_samples( + args.model_path_list, model_path_prefix + ) + model_paths = [] + for rel_model_path in test_samples: + model_path = os.path.join(model_path_prefix, rel_model_path) + if os.path.exists(model_path) and os.path.exists( + os.path.join(model_path, "model.py") + ): + model_paths.append(model_path) + else: + assert os.path.isdir(args.model_path) + test_samples = test_compiler_util.get_allow_samples( + args.model_path_list, model_path_prefix + ) + model_paths = [] + for model_path in path_utils.get_recursively_model_path(args.model_path): + if test_samples is None or os.path.abspath(model_path) in test_samples: + model_paths.append(model_path) + + for model_path in model_paths: + print( + 
f"[{sample_idx}] {module_name}, model_path: {model_path}", + file=sys.stderr, + flush=True, + ) cmd = " ".join( [ sys.executable, - f"-m graph_net_bench.torch.{py_module_name}", + f"-m graph_net_bench.torch.{module_name}", f"--model-path {model_path}", f"--config {args.config}", ] ) try: process = subprocess.Popen(cmd, shell=True) - process.wait() + cmd_ret = process.wait() except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() + if cmd_ret != 0: + failed_samples.append(model_path) + sample_idx += 1 + + print( + f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", + file=sys.stderr, + flush=True, + ) + if failed_samples: + for model_path in failed_samples: + print(f"- {model_path}", file=sys.stderr, flush=True) def compare_perf_diff(args, model_path, ref_dir, target_dir): @@ -272,20 +260,23 @@ def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: def main(args): config_dict = convert_to_dict(args.config) - model_path_prefix = config_dict["ref_env"]["model_path_prefix"] - if args.model_path_list is not None and model_path_prefix is not None: - eval_multi_models_with_prefix(args, model_path_prefix) - return - assert os.path.isdir(args.model_path) - - if path_utils.is_single_model_dir(args.model_path): - eval_single_model(args) + model_path_prefix = config_dict.get("ref_env", {}).get("model_path_prefix") + + if args.model_path_list and model_path_prefix: + eval_multi_models(args, model_path_prefix, use_model_list=True) + elif os.path.isdir(args.model_path): + if path_utils.is_single_model_dir(args.model_path): + eval_single_model(args) + else: + eval_multi_models(args, model_path_prefix, use_model_list=False) else: - eval_multi_models(args, model_path_prefix) + raise ValueError(f"Invalid model path: {args.model_path}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test compiler performance.") + parser = argparse.ArgumentParser( + description="Evaluate backend performance difference." + ) parser.add_argument( "--model-path", type=str, From 9670c7a6787b7d8b03cee5e5232383587ed839ea Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 08:59:30 +0800 Subject: [PATCH 03/17] minor change --- graph_net_bench/torch/eval_backend_diff.py | 4 ++-- graph_net_bench/torch/eval_backend_perf.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 50d17cb62..a5c02ec7b 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -252,7 +252,7 @@ def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: sub.op_lib = getattr(env_ns, "op_lib", None) sub.warmup = getattr(env_ns, "warmup", 3) sub.trials = getattr(env_ns, "trials", 5) - sub.log_prompt = getattr(env_ns, "log_prompt", None) + sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) sub.backend_config = getattr(env_ns, "backend_config", None) return sub @@ -275,7 +275,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Evaluate backend performance difference." + description="Evaluate Backend Performance Difference." 
) parser.add_argument( "--model-path", diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 7e12f6ebf..60194ae88 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -274,7 +274,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="GraphNet Backend Performance Evaluation" + description="Single Backend Performance Evaluation" ) parser.add_argument( "--model-path", @@ -316,7 +316,7 @@ def main(args): "--log-prompt", type=str, required=False, - default="graph-net-test-compiler-log", + default="graph-net-bench-log", help="Log prompt for performance log filtering.", ) parser.add_argument( From cb9a4f1e3cb7aff5a68b9f34083ba913faeb276f Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 09:06:35 +0800 Subject: [PATCH 04/17] Minor change on names --- graph_net_bench/torch/eval_backend_perf.py | 28 ++++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 60194ae88..c550767f0 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -66,28 +66,28 @@ def set_seed(random_seed): torch.cuda.manual_seed_all(random_seed) -def get_hardward_name(args): +def get_hardward_name(device): hardware_name = "unknown" - if "cuda" in args.device: - hardware_name = torch.cuda.get_device_name(args.device) + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) elif args.device == "cpu": hardware_name = platform.processor() return hardware_name -def get_compile_framework_version(args): - if args.compiler in ["inductor", "nope", "unstable_to_stable"]: +def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: return torch.__version__ - elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: # Assuming compiler object has a version attribute - return f"{args.compiler.capitalize()} {args.compiler.version}" + return f"{compiler.capitalize()} {compiler.version}" return "unknown" def load_class_from_file( - args: argparse.Namespace, class_name: str, device: str + model_path: str, class_name: str, device: str ) -> Type[torch.nn.Module]: - file_path = f"{args.model_path}/model.py" + file_path = f"{model_path}/model.py" file = Path(file_path).resolve() module_name = file.stem @@ -128,7 +128,9 @@ def get_model(args): device = "xla" if args.compiler == "xla" else args.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file(args, class_name="GraphModule", device=device) + model_class = load_class_from_file( + args.model_path, class_name="GraphModule", device=device + ) model = model_class().to(torch.device(args.device)) return model @@ -154,7 +156,7 @@ def measure_performance(model_call, args, compiler): model_call() compiler.synchronize() - hardware_name = get_hardward_name(args) + hardware_name = get_hardward_name(args.device) print( f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", file=sys.stderr, @@ -229,8 +231,8 @@ def eval_single_model(args): test_compiler_util.print_basic_config( args, - get_hardward_name(args), - get_compile_framework_version(args), + get_hardward_name(args.device), + 
get_compiler_version(args.compiler), ) test_compiler_util.print_with_log_prompt( From 6b0975da11997a52ee4f8daf11215f6dd3b71564 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:41:35 +0800 Subject: [PATCH 05/17] use call method instead of bash --- graph_net_bench/torch/eval_backend_diff.py | 75 ++++++++++------------ graph_net_bench/torch/eval_backend_perf.py | 4 +- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index a5c02ec7b..bb7811689 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,5 +1,4 @@ from . import utils -import subprocess import argparse import torch import sys @@ -11,6 +10,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils +from .eval_backend_perf import eval_single_model_with_single_backend def convert_to_dict(config_str): @@ -153,23 +153,33 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): file=sys.stderr, flush=True, ) - cmd = " ".join( - [ - sys.executable, - f"-m graph_net_bench.torch.{module_name}", - f"--model-path {model_path}", - f"--config {args.config}", - ] - ) + try: - process = subprocess.Popen(cmd, shell=True) - cmd_ret = process.wait() + single_model_args = argparse.Namespace() + + single_model_args.model_path = model_path + single_model_args.config = args.config + single_model_args.model_path_list = None + + if path_utils.is_single_model_dir(model_path): + eval_single_model(single_model_args) + else: + submodel_paths = path_utils.get_recursively_model_path(model_path) + for submodel_path in submodel_paths: + sub_args = argparse.Namespace() + sub_args.model_path = submodel_path + sub_args.config = args.config + sub_args.model_path_list = None + eval_single_model(sub_args) + cmd_ret = 0 except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() + cmd_ret = 1 + if cmd_ret != 0: failed_samples.append(model_path) sample_idx += 1 @@ -213,48 +223,29 @@ def eval_single_model(args): target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), ) - ref_args = build_sub_args(EvalCfg.ref_env) - target_args = build_sub_args(EvalCfg.target_env) + ref_args = build_sub_args(EvalCfg.ref_env, args.model_path, ref_dir) + target_args = build_sub_args(EvalCfg.target_env, args.model_path, target_dir) - run_sub_process(ref_args, args.model_path, ref_dir) - run_sub_process(target_args, args.model_path, target_dir) + eval_single_model_with_single_backend(ref_args) + eval_single_model_with_single_backend(target_args) compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) -def run_sub_process(env_args, model_path, output_path): - cmd = [sys.executable, "-m", "graph_net_bench.torch.eval_backend_perf"] - args_pairs = [ - ("--model-path", model_path), - ("--output-path", output_path), - ("--seed", str(env_args.seed)), - ("--compiler", env_args.compiler), - ("--device", env_args.device), - ("--op-lib", env_args.op_lib), - ("--warmup", str(env_args.warmup)), - ("--trials", str(env_args.trials)), - ("--log-prompt", env_args.log_prompt), - ("--model-path-prefix", env_args.model_path_prefix), - ("--config", env_args.backend_config), - ] - - for arg_name, arg_value in args_pairs: - if arg_value is not None: - cmd.extend([arg_name, arg_value]) - - subprocess.run(cmd, check=True) - - -def 
build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: +def build_sub_args( + env_ns: types.SimpleNamespace, model_path: str, output_path: str +) -> argparse.Namespace: sub = argparse.Namespace() + sub.model_path = model_path + sub.output_path = output_path sub.seed = getattr(env_ns, "seed", 123) - sub.compiler = getattr(env_ns, "compiler", None) - sub.device = getattr(env_ns, "device", None) + sub.compiler = getattr(env_ns, "compiler", "inductor") + sub.device = getattr(env_ns, "device", "cuda") sub.op_lib = getattr(env_ns, "op_lib", None) sub.warmup = getattr(env_ns, "warmup", 3) sub.trials = getattr(env_ns, "trials", 5) sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.backend_config = getattr(env_ns, "backend_config", None) + sub.config = getattr(env_ns, "backend_config", None) return sub diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index c550767f0..d099ac7d9 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -211,7 +211,7 @@ def measure_performance(model_call, args, compiler): return outs, stats -def eval_single_model(args): +def eval_single_model_with_single_backend(args): log_path = utils.get_log_path(args.output_path, args.model_path) output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) @@ -271,7 +271,7 @@ def model_call(): def main(args): set_seed(args.seed) os.makedirs(args.output_path, exist_ok=True) - eval_single_model(args) + eval_single_model_with_single_backend(args) if __name__ == "__main__": From 980f7377973080b6bfd89250e7129cf3647a638a Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:46:21 +0800 Subject: [PATCH 06/17] minor change --- graph_net_bench/torch/eval_backend_perf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index d099ac7d9..fcf313cef 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -212,6 +212,8 @@ def measure_performance(model_call, args, compiler): def eval_single_model_with_single_backend(args): + set_seed(args.seed) + os.makedirs(args.output_path, exist_ok=True) log_path = utils.get_log_path(args.output_path, args.model_path) output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) @@ -268,12 +270,6 @@ def model_call(): print(content, file=sys.stderr, flush=True) -def main(args): - set_seed(args.seed) - os.makedirs(args.output_path, exist_ok=True) - eval_single_model_with_single_backend(args) - - if __name__ == "__main__": parser = argparse.ArgumentParser( description="Single Backend Performance Evaluation" @@ -336,4 +332,4 @@ def main(args): help="base64 encode configuration json.", ) args = parser.parse_args() - main(args=args) + eval_single_model_with_single_backend(args=args) From 5c49521d22fb87d4cf3f25549dd5ee2b76652262 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:55:08 +0800 Subject: [PATCH 07/17] change some names --- graph_net_bench/torch/eval_backend_diff.py | 16 ++++++++++------ graph_net_bench/torch/eval_backend_perf.py | 8 +++++--- test/eval_backend_diff_test.sh | 8 ++++++-- 3 files changed, 21 insertions(+), 11 
deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index bb7811689..0e7229086 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -219,12 +219,16 @@ def eval_single_model(args): target_dir = "/tmp/eval_perf_diff/B" EvalCfg = types.SimpleNamespace( - ref_env=types.SimpleNamespace(**convert_to_dict(args.config)["ref_env"]), - target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), + reference_config=types.SimpleNamespace( + **convert_to_dict(args.config)["reference_config"] + ), + target_config=types.SimpleNamespace( + **convert_to_dict(args.config)["target_config"] + ), ) - ref_args = build_sub_args(EvalCfg.ref_env, args.model_path, ref_dir) - target_args = build_sub_args(EvalCfg.target_env, args.model_path, target_dir) + ref_args = build_sub_args(EvalCfg.reference_config, args.model_path, ref_dir) + target_args = build_sub_args(EvalCfg.target_config, args.model_path, target_dir) eval_single_model_with_single_backend(ref_args) eval_single_model_with_single_backend(target_args) @@ -245,13 +249,13 @@ def build_sub_args( sub.trials = getattr(env_ns, "trials", 5) sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.config = getattr(env_ns, "backend_config", None) + sub.backend_config = getattr(env_ns, "backend_config", None) return sub def main(args): config_dict = convert_to_dict(args.config) - model_path_prefix = config_dict.get("ref_env", {}).get("model_path_prefix") + model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") if args.model_path_list and model_path_prefix: eval_multi_models(args, model_path_prefix, use_model_list=True) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index fcf313cef..29c40d3fb 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -120,8 +120,10 @@ def get_compiler_backend(args) -> GraphCompilerBackend: args.compiler in compiler_backend_name2class ), f"Unknown compiler: {args.compiler}" backend_class = compiler_backend_name2class[args.compiler] - config = convert_to_dict(args.config) if args.config is not None else {} - return backend_class(config) + backend_config = ( + convert_to_dict(args.backend_config) if args.backend_config is not None else {} + ) + return backend_class(backend_config) def get_model(args): @@ -325,7 +327,7 @@ def model_call(): help="Prefix path to model path list", ) parser.add_argument( - "--config", + "--backend-config", type=str, required=False, default=None, diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index 16da81903..17bba712e 100755 --- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -10,14 +10,18 @@ python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ --config $(base64 -w 0 < Date: Fri, 16 Jan 2026 15:51:53 +0800 Subject: [PATCH 08/17] Dynamically load backend class based on args.compiler --- graph_net_bench/torch/eval_backend_perf.py | 61 +++++++++------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 29c40d3fb..4d5ea94a5 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -13,41 +13,10 @@ import platform import 
base64 from contextlib import redirect_stdout, redirect_stderr - from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench.torch.backend.tvm_backend import TvmBackend -from graph_net_bench.torch.backend.xla_backend import XlaBackend -from graph_net_bench.torch.backend.inductor_backend import InductorBackend -from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend -from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend -from graph_net_bench.torch.backend.nope_backend import NopeBackend -from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend -from graph_net_bench.torch.backend.unstable_to_stable_backend import ( - UnstableToStableBackend, -) -from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( - RangeDecomposerValidatorBackend, -) -from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( - GraphVariableRenamerValidatorBackend, -) from graph_net_bench import test_compiler_util -compiler_backend_name2class = { - "tvm": TvmBackend, - "xla": XlaBackend, - "inductor": InductorBackend, - "tensorrt": TensorRTBackend, - "bladedisc": BladeDISCBackend, - "nope": NopeBackend, - "pass_mgr": PassMgrBackend, - "unstable_to_stable": UnstableToStableBackend, - "range_decomposer_validator": RangeDecomposerValidatorBackend, - "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, -} - - def register_op_lib(op_lib): if op_lib == "flaggems": import flag_gems @@ -70,7 +39,7 @@ def get_hardward_name(device): hardware_name = "unknown" if "cuda" in device: hardware_name = torch.cuda.get_device_name(device) - elif args.device == "cpu": + elif device == "cpu": hardware_name = platform.processor() return hardware_name @@ -116,10 +85,28 @@ def convert_to_dict(config_str): def get_compiler_backend(args) -> GraphCompilerBackend: - assert ( - args.compiler in compiler_backend_name2class - ), f"Unknown compiler: {args.compiler}" - backend_class = compiler_backend_name2class[args.compiler] + """ + Dynamically load backend class based on args.compiler + """ + compiler_name = args.compiler.lower() + module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + + try: + module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) + + class_name = ( + f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" + ) + + backend_class = None + if hasattr(module, class_name): + backend_class = getattr(module, class_name) + else: + raise ImportError(f"No valid backend class found in {module_name}") + + except ImportError as e: + raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") + backend_config = ( convert_to_dict(args.backend_config) if args.backend_config is not None else {} ) @@ -327,7 +314,7 @@ def model_call(): help="Prefix path to model path list", ) parser.add_argument( - "--backend-config", + "--config", type=str, required=False, default=None, From 8c2b1c3799020940836c34ace41350557b1e7ac8 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 16:28:01 +0800 Subject: [PATCH 09/17] Change argument passing to json config --- graph_net_bench/test_compiler_util.py | 28 +++++ graph_net_bench/torch/eval_backend_diff.py | 50 +++----- graph_net_bench/torch/eval_backend_perf.py | 131 +++++++-------------- 3 files changed, 88 insertions(+), 121 deletions(-) diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py index 
f587da2ff..de38a29fa 100644 --- a/graph_net_bench/test_compiler_util.py +++ b/graph_net_bench/test_compiler_util.py @@ -5,6 +5,7 @@ import time import subprocess import shutil +import base64 import numpy as np from dataclasses import dataclass from contextlib import contextmanager @@ -156,6 +157,24 @@ def print_basic_config(args, hardware_name, compile_framework_version): ) +def print_config(model_path, config, hardware_name, compiler_version): + model_path = os.path.normpath(model_path) + model_name = get_model_name(model_path) + print_with_log_prompt("[Config] model:", model_name, config.log_prompt) + print_with_log_prompt("[Config] seed:", config.seed, config.log_prompt) + print_with_log_prompt("[Config] device:", config.device, config.log_prompt) + print_with_log_prompt("[Config] hardware:", hardware_name, config.log_prompt) + print_with_log_prompt("[Config] op_lib:", config.op_lib, config.log_prompt) + print_with_log_prompt("[Config] compiler:", config.compiler, config.log_prompt) + print_with_log_prompt("[Config] warmup:", config.warmup, config.log_prompt) + print_with_log_prompt("[Config] trials:", config.trials, config.log_prompt) + print_with_log_prompt( + "[Config] compile_framework_version:", + compiler_version, + config.log_prompt, + ) + + def print_running_status(args, eager_success, compiled_success=None): def convert_to_str(b): return "success" if b else "failed" @@ -353,3 +372,12 @@ def get_allow_samples(allow_list, model_path_prefix): test_samples.append(os.path.join(model_path_prefix, line.strip())) return test_samples + + +def convert_to_dict(config_str): + if config_str in {None, "", "null", "None"}: + return {} + config_str = base64.b64decode(config_str).decode("utf-8") + config = json.loads(config_str) + assert isinstance(config, dict), f"config should be a dict. {config_str=}" + return config diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 0e7229086..6f8dc550b 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -6,22 +6,12 @@ import os.path import traceback import json -import base64 import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils from .eval_backend_perf import eval_single_model_with_single_backend -def convert_to_dict(config_str): - if config_str in {None, "", "null", "None"}: - return {} - config_str = base64.b64decode(config_str).decode("utf-8") - config = json.loads(config_str) - assert isinstance(config, dict), f"config should be a dict. 
{config_str=}" - return config - - def compare_correctness(expected_out, compiled_out, args): eager_dtypes = [ ( @@ -220,41 +210,37 @@ def eval_single_model(args): EvalCfg = types.SimpleNamespace( reference_config=types.SimpleNamespace( - **convert_to_dict(args.config)["reference_config"] + **test_compiler_util.convert_to_dict(args.config)["reference_config"] ), target_config=types.SimpleNamespace( - **convert_to_dict(args.config)["target_config"] + **test_compiler_util.convert_to_dict(args.config)["target_config"] ), ) - ref_args = build_sub_args(EvalCfg.reference_config, args.model_path, ref_dir) - target_args = build_sub_args(EvalCfg.target_config, args.model_path, target_dir) + reference_config = build_sub_config(EvalCfg.reference_config) + target_config = build_sub_config(EvalCfg.target_config) - eval_single_model_with_single_backend(ref_args) - eval_single_model_with_single_backend(target_args) - compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) + eval_single_model_with_single_backend(args.model_path, ref_dir, reference_config) + eval_single_model_with_single_backend(args.model_path, target_dir, target_config) + compare_perf_diff(reference_config, args.model_path, ref_dir, target_dir) -def build_sub_args( - env_ns: types.SimpleNamespace, model_path: str, output_path: str -) -> argparse.Namespace: +def build_sub_config(config): sub = argparse.Namespace() - sub.model_path = model_path - sub.output_path = output_path - sub.seed = getattr(env_ns, "seed", 123) - sub.compiler = getattr(env_ns, "compiler", "inductor") - sub.device = getattr(env_ns, "device", "cuda") - sub.op_lib = getattr(env_ns, "op_lib", None) - sub.warmup = getattr(env_ns, "warmup", 3) - sub.trials = getattr(env_ns, "trials", 5) - sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") - sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.backend_config = getattr(env_ns, "backend_config", None) + sub.seed = getattr(config, "seed", 123) + sub.compiler = getattr(config, "compiler", "inductor") + sub.device = getattr(config, "device", "cuda") + sub.op_lib = getattr(config, "op_lib", None) + sub.warmup = getattr(config, "warmup", 3) + sub.trials = getattr(config, "trials", 5) + sub.log_prompt = getattr(config, "log_prompt", "graph-net-bench-log") + sub.model_path_prefix = getattr(config, "model_path_prefix", None) + sub.backend_config = getattr(config, "backend_config", None) return sub def main(args): - config_dict = convert_to_dict(args.config) + config_dict = test_compiler_util.convert_to_dict(args.config) model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") if args.model_path_list and model_path_prefix: diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 4d5ea94a5..3774d4176 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -11,7 +11,6 @@ import random import numpy as np import platform -import base64 from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util @@ -75,20 +74,11 @@ def load_class_from_file( return model_class -def convert_to_dict(config_str): - if config_str is None or config_str == "None": - return {} - config_str = base64.b64decode(config_str).decode("utf-8") - config = json.loads(config_str) - assert isinstance(config, dict), f"config should be a dict. 
{config_str=}" - return config - - -def get_compiler_backend(args) -> GraphCompilerBackend: +def get_compiler_backend(config) -> GraphCompilerBackend: """ - Dynamically load backend class based on args.compiler + Dynamically load backend class based on config.compiler """ - compiler_name = args.compiler.lower() + compiler_name = config.compiler.lower() module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" try: @@ -108,56 +98,58 @@ def get_compiler_backend(args) -> GraphCompilerBackend: raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") backend_config = ( - convert_to_dict(args.backend_config) if args.backend_config is not None else {} + test_compiler_util.convert_to_dict(config.backend_config) + if config.backend_config is not None + else {} ) return backend_class(backend_config) -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device +def get_model(model_path, config): + device = "xla" if config.compiler == "xla" else config.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') model_class = load_class_from_file( - args.model_path, class_name="GraphModule", device=device + model_path, class_name="GraphModule", device=device ) - model = model_class().to(torch.device(args.device)) + model = model_class().to(torch.device(config.device)) return model -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") +def get_input_dict(model_path, config): + inputs_params = utils.load_converted_from_text(f"{model_path}") params = inputs_params["weight_info"] for tensor_meta in params.values(): if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device + tensor_meta["info"]["device"] = config.device return { - k: utils.replay_tensor(v).to(torch.device(args.device)) + k: utils.replay_tensor(v).to(torch.device(config.device)) for k, v in params.items() } -def measure_performance(model_call, args, compiler): +def measure_performance(model_call, config, compiler): stats = {} outs = model_call() # Warmup runs - for _ in range(args.warmup): + for _ in range(config.warmup): model_call() compiler.synchronize() - hardware_name = get_hardward_name(args.device) + hardware_name = get_hardward_name(config.device) print( - f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", + f"[Profiling] Using device: {config.device} {hardware_name}, warm up {config.warmup}, trials {config.trials}", file=sys.stderr, flush=True, ) - if "cuda" in args.device: + if "cuda" in config.device: torch.cuda.empty_cache() e2e_times = [] gpu_times = [] - for i in range(args.trials): + for i in range(config.trials): # End-to-end timing (naive_timer) duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): @@ -185,7 +177,7 @@ def measure_performance(model_call, args, compiler): else: # CPU or other devices e2e_times = [] - for i in range(args.trials): + for i in range(config.trials): duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): model_call() @@ -200,34 +192,27 @@ def measure_performance(model_call, args, compiler): return outs, stats -def eval_single_model_with_single_backend(args): - set_seed(args.seed) - os.makedirs(args.output_path, exist_ok=True) - log_path = utils.get_log_path(args.output_path, args.model_path) - output_dump_path = 
utils.get_output_path(args.output_path, args.model_path) +def eval_single_model_with_single_backend(model_path, output_path, config): + set_seed(config.seed) + os.makedirs(output_path, exist_ok=True) + log_path = utils.get_log_path(output_path, model_path) + output_dump_path = utils.get_output_path(output_path, model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) with open(log_path, "w", encoding="utf-8") as log_f: with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(args) + compiler = get_compiler_backend(config) - input_dict = get_input_dict(args) - model = get_model(args) + input_dict = get_input_dict(model_path, config) + model = get_model(model_path, config) model.eval() - test_compiler_util.print_with_log_prompt( - "[Config] seed:", args.seed, args.log_prompt - ) - - test_compiler_util.print_basic_config( - args, - get_hardward_name(args.device), - get_compiler_version(args.compiler), - ) - - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt + test_compiler_util.print_config( + model_path, + config, + get_hardward_name(config.device), + get_compiler_version(config.compiler), ) success = False @@ -238,7 +223,7 @@ def eval_single_model_with_single_backend(args): def model_call(): return compiled_model(**input_dict) - outputs, time_stats = measure_performance(model_call, args, compiler) + outputs, time_stats = measure_performance(model_call, config, compiler) success = True except Exception as e: print( @@ -247,11 +232,11 @@ def model_call(): flush=True, ) - test_compiler_util.print_running_status(args, success) + test_compiler_util.print_running_status(config, success) if success: torch.save(outputs, str(output_dump_path)) test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + "[Performance][eager]:", json.dumps(time_stats), config.log_prompt ) with open(log_path, "r", encoding="utf-8") as f: @@ -277,42 +262,6 @@ def model_call(): default="/tmp/test_save", help="Path to save outputs", ) - parser.add_argument("--seed", type=int, required=False, default=123) - parser.add_argument( - "--compiler", - type=str, - required=False, - default="inductor", - help="Path to customized compiler python file", - ) - parser.add_argument( - "--device", - type=str, - required=False, - default="cuda", - help="Device for testing the compiler (e.g., 'cpu' or 'cuda')", - ) - parser.add_argument("--op-lib", type=str, required=False, default=None) - parser.add_argument( - "--warmup", type=int, required=False, default=3, help="Number of warmup steps" - ) - parser.add_argument( - "--trials", type=int, required=False, default=5, help="Number of timing trials" - ) - parser.add_argument( - "--log-prompt", - type=str, - required=False, - default="graph-net-bench-log", - help="Log prompt for performance log filtering.", - ) - parser.add_argument( - "--model-path-prefix", - type=str, - required=False, - default=None, - help="Prefix path to model path list", - ) parser.add_argument( "--config", type=str, @@ -321,4 +270,8 @@ def model_call(): help="base64 encode configuration json.", ) args = parser.parse_args() - eval_single_model_with_single_backend(args=args) + eval_single_model_with_single_backend( + args.model_path, + args.output_path, + **test_compiler_util.convert_to_dict(args.config), + ) From db877bdb37c4cf20ca29414340d6a5707f072be9 Mon Sep 17 00:00:00 2001 From: JewelRoam 
<2752594773@qq.com> Date: Fri, 16 Jan 2026 18:11:20 +0800 Subject: [PATCH 10/17] Add check_and_complete_args --- graph_net_bench/test_compiler_util.py | 22 ++--- graph_net_bench/torch/eval_backend_diff.py | 73 ++++++--------- graph_net_bench/torch/eval_backend_perf.py | 100 +++++++++++++-------- 3 files changed, 101 insertions(+), 94 deletions(-) diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py index de38a29fa..44ccc703e 100644 --- a/graph_net_bench/test_compiler_util.py +++ b/graph_net_bench/test_compiler_util.py @@ -157,21 +157,21 @@ def print_basic_config(args, hardware_name, compile_framework_version): ) -def print_config(model_path, config, hardware_name, compiler_version): - model_path = os.path.normpath(model_path) +def print_config(args, hardware_name, compiler_version): + model_path = os.path.normpath(args.model_path) model_name = get_model_name(model_path) - print_with_log_prompt("[Config] model:", model_name, config.log_prompt) - print_with_log_prompt("[Config] seed:", config.seed, config.log_prompt) - print_with_log_prompt("[Config] device:", config.device, config.log_prompt) - print_with_log_prompt("[Config] hardware:", hardware_name, config.log_prompt) - print_with_log_prompt("[Config] op_lib:", config.op_lib, config.log_prompt) - print_with_log_prompt("[Config] compiler:", config.compiler, config.log_prompt) - print_with_log_prompt("[Config] warmup:", config.warmup, config.log_prompt) - print_with_log_prompt("[Config] trials:", config.trials, config.log_prompt) + print_with_log_prompt("[Config] model:", model_name, args.log_prompt) + print_with_log_prompt("[Config] seed:", args.seed, args.log_prompt) + print_with_log_prompt("[Config] device:", args.device, args.log_prompt) + print_with_log_prompt("[Config] hardware:", hardware_name, args.log_prompt) + print_with_log_prompt("[Config] op_lib:", args.op_lib, args.log_prompt) + print_with_log_prompt("[Config] compiler:", args.compiler, args.log_prompt) + print_with_log_prompt("[Config] warmup:", args.warmup, args.log_prompt) + print_with_log_prompt("[Config] trials:", args.trials, args.log_prompt) print_with_log_prompt( "[Config] compile_framework_version:", compiler_version, - config.log_prompt, + args.log_prompt, ) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 6f8dc550b..c230f6bd8 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -146,10 +146,9 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): try: single_model_args = argparse.Namespace() - single_model_args.model_path = model_path - single_model_args.config = args.config single_model_args.model_path_list = None + single_model_args.config = args.config if path_utils.is_single_model_dir(model_path): eval_single_model(single_model_args) @@ -158,8 +157,8 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): for submodel_path in submodel_paths: sub_args = argparse.Namespace() sub_args.model_path = submodel_path - sub_args.config = args.config sub_args.model_path_list = None + sub_args.config = args.config eval_single_model(sub_args) cmd_ret = 0 except KeyboardInterrupt: @@ -184,60 +183,44 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): print(f"- {model_path}", file=sys.stderr, flush=True) -def compare_perf_diff(args, model_path, ref_dir, target_dir): +def eval_single_model(args): + ref_dir = "/tmp/eval_perf_diff/A" + target_dir = 
"/tmp/eval_perf_diff/B" + + ref_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=ref_dir, + **test_compiler_util.convert_to_dict(args.config)["reference_config"], + ) + target_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=target_dir, + **test_compiler_util.convert_to_dict(args.config)["target_config"], + ) + + eval_single_model_with_single_backend(ref_args) + eval_single_model_with_single_backend(target_args) + + # compare_perf_diff # A - ref_dump_path = utils.get_output_path(ref_dir, model_path) + ref_dump_path = utils.get_output_path(ref_dir, args.model_path) ref_out = torch.load(str(ref_dump_path)) - ref_log_path = utils.get_log_path(ref_dir, model_path) + ref_log_path = utils.get_log_path(ref_dir, args.model_path) ref_time_stats = parse_time_stats_from_reference_log(ref_log_path) # B - target_dump_path = utils.get_output_path(target_dir, model_path) + target_dump_path = utils.get_output_path(target_dir, args.model_path) target_out = torch.load(str(target_dump_path)) - target_log_path = utils.get_log_path(target_dir, model_path) + target_log_path = utils.get_log_path(target_dir, args.model_path) target_time_stats = parse_time_stats_from_reference_log(target_log_path) - compare_correctness(ref_out, target_out, args) - - test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) - - -def eval_single_model(args): - ref_dir = "/tmp/eval_perf_diff/A" - target_dir = "/tmp/eval_perf_diff/B" - - EvalCfg = types.SimpleNamespace( - reference_config=types.SimpleNamespace( - **test_compiler_util.convert_to_dict(args.config)["reference_config"] - ), - target_config=types.SimpleNamespace( - **test_compiler_util.convert_to_dict(args.config)["target_config"] - ), + compare_correctness(ref_out, target_out, ref_args) + test_compiler_util.print_times_and_speedup( + ref_args, ref_time_stats, target_time_stats ) - reference_config = build_sub_config(EvalCfg.reference_config) - target_config = build_sub_config(EvalCfg.target_config) - - eval_single_model_with_single_backend(args.model_path, ref_dir, reference_config) - eval_single_model_with_single_backend(args.model_path, target_dir, target_config) - compare_perf_diff(reference_config, args.model_path, ref_dir, target_dir) - - -def build_sub_config(config): - sub = argparse.Namespace() - sub.seed = getattr(config, "seed", 123) - sub.compiler = getattr(config, "compiler", "inductor") - sub.device = getattr(config, "device", "cuda") - sub.op_lib = getattr(config, "op_lib", None) - sub.warmup = getattr(config, "warmup", 3) - sub.trials = getattr(config, "trials", 5) - sub.log_prompt = getattr(config, "log_prompt", "graph-net-bench-log") - sub.model_path_prefix = getattr(config, "model_path_prefix", None) - sub.backend_config = getattr(config, "backend_config", None) - return sub - def main(args): config_dict = test_compiler_util.convert_to_dict(args.config) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 3774d4176..5c8586f30 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -11,6 +11,7 @@ import random import numpy as np import platform +import types from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util @@ -74,11 +75,11 @@ def load_class_from_file( return model_class -def get_compiler_backend(config) -> GraphCompilerBackend: +def 
get_compiler_backend(args) -> GraphCompilerBackend: """ - Dynamically load backend class based on config.compiler + Dynamically load backend class based on args.compiler """ - compiler_name = config.compiler.lower() + compiler_name = args.compiler.lower() module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" try: @@ -98,58 +99,57 @@ def get_compiler_backend(config) -> GraphCompilerBackend: raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") backend_config = ( - test_compiler_util.convert_to_dict(config.backend_config) - if config.backend_config is not None + test_compiler_util.convert_to_dict(args.backend_config) + if args.backend_config is not None else {} ) return backend_class(backend_config) -def get_model(model_path, config): - device = "xla" if config.compiler == "xla" else config.device +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') model_class = load_class_from_file( - model_path, class_name="GraphModule", device=device + args.model_path, class_name="GraphModule", device=device ) - model = model_class().to(torch.device(config.device)) + model = model_class().to(torch.device(args.device)) return model -def get_input_dict(model_path, config): - inputs_params = utils.load_converted_from_text(f"{model_path}") +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") params = inputs_params["weight_info"] for tensor_meta in params.values(): if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = config.device + tensor_meta["info"]["device"] = args.device return { - k: utils.replay_tensor(v).to(torch.device(config.device)) + k: utils.replay_tensor(v).to(torch.device(args.device)) for k, v in params.items() } -def measure_performance(model_call, config, compiler): +def measure_performance(model_call, args, compiler): stats = {} outs = model_call() # Warmup runs - for _ in range(config.warmup): + for _ in range(args.warmup): model_call() compiler.synchronize() - hardware_name = get_hardward_name(config.device) print( - f"[Profiling] Using device: {config.device} {hardware_name}, warm up {config.warmup}, trials {config.trials}", + f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", file=sys.stderr, flush=True, ) - if "cuda" in config.device: + if "cuda" in args.device: torch.cuda.empty_cache() e2e_times = [] gpu_times = [] - for i in range(config.trials): + for i in range(args.trials): # End-to-end timing (naive_timer) duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): @@ -177,7 +177,7 @@ def measure_performance(model_call, config, compiler): else: # CPU or other devices e2e_times = [] - for i in range(config.trials): + for i in range(args.trials): duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): model_call() @@ -192,27 +192,27 @@ def measure_performance(model_call, config, compiler): return outs, stats -def eval_single_model_with_single_backend(model_path, output_path, config): - set_seed(config.seed) - os.makedirs(output_path, exist_ok=True) - log_path = utils.get_log_path(output_path, model_path) - output_dump_path = utils.get_output_path(output_path, model_path) +def eval_single_model_with_single_backend(args): + check_and_complete_args(args) + set_seed(args.seed) + os.makedirs(args.output_path, 
exist_ok=True) + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) with open(log_path, "w", encoding="utf-8") as log_f: with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(config) + compiler = get_compiler_backend(args) - input_dict = get_input_dict(model_path, config) - model = get_model(model_path, config) + input_dict = get_input_dict(args) + model = get_model(args) model.eval() test_compiler_util.print_config( - model_path, - config, - get_hardward_name(config.device), - get_compiler_version(config.compiler), + args, + get_hardward_name(args.device), + get_compiler_version(args.compiler), ) success = False @@ -223,7 +223,7 @@ def eval_single_model_with_single_backend(model_path, output_path, config): def model_call(): return compiled_model(**input_dict) - outputs, time_stats = measure_performance(model_call, config, compiler) + outputs, time_stats = measure_performance(model_call, args, compiler) success = True except Exception as e: print( @@ -232,11 +232,11 @@ def model_call(): flush=True, ) - test_compiler_util.print_running_status(config, success) + test_compiler_util.print_running_status(args, success) if success: torch.save(outputs, str(output_dump_path)) test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), config.log_prompt + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt ) with open(log_path, "r", encoding="utf-8") as f: @@ -244,6 +244,29 @@ def model_call(): print(content, file=sys.stderr, flush=True) +def check_and_complete_args(args): + """ + Ensure all required arguments are present with default values if missing + """ + defaults = { + "model_path": None, # Model path + "output_path": None, # Log and output directory + "seed": 123, # Random seed + "compiler": "inductor", # Compiler name + "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') + "op_lib": None, # Operator library + "warmup": 3, # Number of warmup steps + "trials": 5, # Number of timing trials + "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering + "model_path_prefix": None, # Prefix path to model path in args.model-path + "backend_config": None, # backend configuration json + } + + for key, default in defaults.items(): + if not hasattr(args, key): + setattr(args, key, default) + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Single Backend Performance Evaluation" @@ -270,8 +293,9 @@ def model_call(): help="base64 encode configuration json.", ) args = parser.parse_args() - eval_single_model_with_single_backend( - args.model_path, - args.output_path, + mut_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=args.output_path, **test_compiler_util.convert_to_dict(args.config), ) + eval_single_model_with_single_backend(mut_args) From 0e6ec45faf2fe026640c7535e0ed4d2e567dfe02 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 18:25:24 +0800 Subject: [PATCH 11/17] Simplify --- graph_net_bench/torch/eval_backend_diff.py | 78 +++++++++++----------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index c230f6bd8..ecafb71ae 100755 --- 
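The check_and_complete_args() helper introduced above lets callers pass a sparse namespace and fall back to the listed defaults. A minimal sketch, assuming the helper is in scope (the paths and compiler below are illustrative):

import types

sparse = types.SimpleNamespace(
    model_path="samples/demo_model",  # hypothetical
    output_path="/tmp/perf_out",      # hypothetical
    compiler="tvm",
)
check_and_complete_args(sparse)
assert sparse.warmup == 3 and sparse.trials == 5 and sparse.device == "cuda"
assert sparse.compiler == "tvm"  # explicitly-set fields are left untouched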
a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -109,72 +109,74 @@ def parse_time_stats_from_reference_log(log_path): return time_stats -def eval_multi_models(args, model_path_prefix=None, use_model_list=False): - sample_idx = 0 - failed_samples = [] - module_name = os.path.splitext(os.path.basename(__file__))[0] - +def _get_model_paths(args, model_path_prefix, use_model_list): if use_model_list: - assert os.path.isdir(model_path_prefix) - assert os.path.isfile(args.model_path_list) + assert os.path.isdir(model_path_prefix) and os.path.isfile(args.model_path_list) + test_samples = test_compiler_util.get_allow_samples( args.model_path_list, model_path_prefix ) - model_paths = [] - for rel_model_path in test_samples: - model_path = os.path.join(model_path_prefix, rel_model_path) - if os.path.exists(model_path) and os.path.exists( - os.path.join(model_path, "model.py") - ): - model_paths.append(model_path) + model_paths = [ + os.path.join(model_path_prefix, rel_model_path) + for rel_model_path in test_samples + if os.path.exists( + os.path.join(model_path_prefix, rel_model_path, "model.py") + ) + ] else: assert os.path.isdir(args.model_path) + test_samples = test_compiler_util.get_allow_samples( args.model_path_list, model_path_prefix ) - model_paths = [] - for model_path in path_utils.get_recursively_model_path(args.model_path): - if test_samples is None or os.path.abspath(model_path) in test_samples: - model_paths.append(model_path) + model_paths = [ + model_path + for model_path in path_utils.get_recursively_model_path(args.model_path) + if test_samples is None or os.path.abspath(model_path) in test_samples + ] + + return model_paths + - for model_path in model_paths: +def _create_model_args(model_path, config): + args = argparse.Namespace() + args.model_path = model_path + args.model_path_list = None + args.config = config + return args + + +def eval_multi_models(args, model_path_prefix=None, use_model_list=False): + module_name = os.path.splitext(os.path.basename(__file__))[0] + + model_paths = _get_model_paths(args, model_path_prefix, use_model_list) + failed_samples = [] + for sample_idx, model_path in enumerate(model_paths): print( f"[{sample_idx}] {module_name}, model_path: {model_path}", file=sys.stderr, flush=True, ) - try: - single_model_args = argparse.Namespace() - single_model_args.model_path = model_path - single_model_args.model_path_list = None - single_model_args.config = args.config - if path_utils.is_single_model_dir(model_path): - eval_single_model(single_model_args) + eval_single_model(_create_model_args(model_path, args.config)) else: - submodel_paths = path_utils.get_recursively_model_path(model_path) - for submodel_path in submodel_paths: - sub_args = argparse.Namespace() - sub_args.model_path = submodel_path - sub_args.model_path_list = None - sub_args.config = args.config - eval_single_model(sub_args) - cmd_ret = 0 + for submodel_path in path_utils.get_recursively_model_path(model_path): + eval_single_model(_create_model_args(submodel_path, args.config)) + success = True except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() - cmd_ret = 1 + success = False - if cmd_ret != 0: + if not success: failed_samples.append(model_path) - sample_idx += 1 print( - f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", + f"Totally {len(model_paths)} verified samples, failed {len(failed_samples)} samples.", 
file=sys.stderr, flush=True, ) From a5fa17369258592c16abdd0ef69a47a92c5f677c Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 13:57:16 +0800 Subject: [PATCH 12/17] modify args.config to separate args.reference_config and args.target_config --- graph_net_bench/torch/eval_backend_diff.py | 39 +++++++++++++--------- test/eval_backend_diff_test.sh | 31 ++++++++--------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index ecafb71ae..c254eafaf 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -138,11 +138,12 @@ def _get_model_paths(args, model_path_prefix, use_model_list): return model_paths -def _create_model_args(model_path, config): +def _create_model_args(model_path, reference_config, target_config): args = argparse.Namespace() args.model_path = model_path args.model_path_list = None - args.config = config + args.reference_config = reference_config + args.target_config = target_config return args @@ -157,12 +158,15 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): file=sys.stderr, flush=True, ) + + model_args = argparse.Namespace() + model_args.model_path = model_path + model_args.model_path_list = None + model_args.reference_config = args.reference_config + model_args.target_config = args.target_config + try: - if path_utils.is_single_model_dir(model_path): - eval_single_model(_create_model_args(model_path, args.config)) - else: - for submodel_path in path_utils.get_recursively_model_path(model_path): - eval_single_model(_create_model_args(submodel_path, args.config)) + eval_single_model(model_args) success = True except KeyboardInterrupt: print("KeyboardInterrupt") @@ -192,12 +196,12 @@ def eval_single_model(args): ref_args = types.SimpleNamespace( model_path=args.model_path, output_path=ref_dir, - **test_compiler_util.convert_to_dict(args.config)["reference_config"], + **test_compiler_util.convert_to_dict(args.reference_config), ) target_args = types.SimpleNamespace( model_path=args.model_path, output_path=target_dir, - **test_compiler_util.convert_to_dict(args.config)["target_config"], + **test_compiler_util.convert_to_dict(args.target_config), ) eval_single_model_with_single_backend(ref_args) @@ -225,8 +229,8 @@ def eval_single_model(args): def main(args): - config_dict = test_compiler_util.convert_to_dict(args.config) - model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") + ref_config = test_compiler_util.convert_to_dict(args.reference_config) + model_path_prefix = ref_config.get("model_path_prefix") if args.model_path_list and model_path_prefix: eval_multi_models(args, model_path_prefix, use_model_list=True) @@ -258,11 +262,16 @@ def main(args): help="Path to samples list, each line contains a sample path", ) parser.add_argument( - "--config", + "--reference-config", type=str, - required=False, - default=None, - help="base64 encode configuration json.", + required=True, + help="base64 encode reference config json.", + ) + parser.add_argument( + "--target-config", + type=str, + required=True, + help="base64 encode target config json.", ) args = parser.parse_args() main(args=args) diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index 17bba712e..1eaca5ecd 100755 --- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -8,22 +8,23 @@ 
model_list="$AI4C_ROOT/test/workspace_eval_backend_diff/sample_list.txt" python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ - --config $(base64 -w 0 <&1 | tee "$OUTPUT_PATH/validation.log" From 0c9e07b8d93e9f1aba28569a86995edab583b383 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 14:24:51 +0800 Subject: [PATCH 13/17] reuse some code --- graph_net/torch/test_reference_device.py | 26 ++++-------------------- graph_net/torch/test_target_device.py | 22 +++++++------------- 2 files changed, 11 insertions(+), 37 deletions(-) diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index f022d2ba5..33d0ec8e4 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -11,30 +11,12 @@ from graph_net_bench import test_compiler_util from graph_net import model_path_util from graph_net_bench.torch import test_compiler - - -def get_reference_log_path(reference_dir, model_path): - model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") - return os.path.join(reference_dir, f"{model_name}.log") - - -def get_reference_output_path(reference_dir, model_path): - model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") - return os.path.join(reference_dir, f"{model_name}.pth") - - -def register_op_lib(op_lib): - if op_lib == "flaggems": - import flag_gems - - flag_gems.enable() - else: - pass +from graph_net_bench.torch import utils, eval_backend_perf def test_single_model(args): - ref_log = get_reference_log_path(args.reference_dir, args.model_path) - ref_dump = get_reference_output_path(args.reference_dir, args.model_path) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) + ref_dump = utils.get_output_path(args.reference_dir, args.model_path) print(f"Reference log path: {ref_log}", file=sys.stderr, flush=True) print(f"Reference outputs path: {ref_dump}", file=sys.stderr, flush=True) @@ -149,7 +131,7 @@ def main(args): ref_dump_dir.mkdir(parents=True, exist_ok=True) if path_utils.is_single_model_dir(args.model_path): - register_op_lib(args.op_lib) + eval_backend_perf.register_op_lib(args.op_lib) test_single_model(args) else: test_multi_models(args) diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index ec2085a32..cf56dee69 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -8,7 +8,7 @@ from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler, test_reference_device +from graph_net_bench.torch import test_compiler, utils, eval_backend_perf def parse_config_from_reference_log(log_path): @@ -46,9 +46,7 @@ def parse_time_stats_from_reference_log(log_path): def update_args_and_set_seed(args, model_path): - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, model_path - ) + ref_log = utils.get_log_path(args.reference_dir, model_path) config = parse_config_from_reference_log(ref_log) vars(args)["model_path"] = model_path vars(args)["compiler"] = config.get("compiler") @@ -100,14 +98,10 @@ def model_call(): if test_compiler_util.get_subgraph_tag(args.model_path): model_name += "_" + test_compiler_util.get_subgraph_tag(args.model_path) - ref_dump = test_reference_device.get_reference_output_path( - args.reference_dir, args.model_path - ) + ref_dump = 
utils.get_output_path(args.reference_dir, args.model_path) ref_out = torch.load(str(ref_dump)) - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, args.model_path - ) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) ref_time_stats = parse_time_stats_from_reference_log(ref_log) if success: @@ -117,7 +111,7 @@ def model_call(): def is_reference_log_exist(reference_dir, model_path): - log_path = test_reference_device.get_reference_log_path(reference_dir, model_path) + log_path = utils.get_log_path(reference_dir, model_path) return os.path.isfile(log_path) @@ -171,16 +165,14 @@ def main(args): if path_utils.is_single_model_dir(args.model_path): if args.op_lib == "origin": - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, args.model_path - ) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) config = parse_config_from_reference_log(ref_log) vars(args)["op_lib"] = config.get("op_lib") test_compiler_util.print_with_log_prompt( "[Config] op_lib:", args.op_lib, args.log_prompt ) else: - test_reference_device.register_op_lib(args.op_lib) + eval_backend_perf.register_op_lib(args.op_lib) args = update_args_and_set_seed(args, args.model_path) test_single_model(args) From ebd46af74be6fb0ee7828cb4eca27754afcf1a11 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 14:37:02 +0800 Subject: [PATCH 14/17] Add unittest on test device; minor fix --- graph_net/torch/test_reference_device.py | 2 +- test/eval_device_diff_test.sh | 37 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100755 test/eval_device_diff_test.sh diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index 33d0ec8e4..6a28095e4 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -119,7 +119,7 @@ def test_multi_models(args): def main(args): assert os.path.isdir(args.model_path) # Support all torch compilers - valid_compilers = list(test_compiler.registry_backend.keys()) + valid_compilers = list(test_compiler.compiler_backend_name2class.keys()) assert ( args.compiler in valid_compilers ), f"Compiler must be one of {valid_compilers}" diff --git a/test/eval_device_diff_test.sh b/test/eval_device_diff_test.sh new file mode 100755 index 000000000..10e0ab766 --- /dev/null +++ b/test/eval_device_diff_test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +AI4C_ROOT=$(python3 -c "import graph_net_bench; import os; print(os.path.dirname(os.path.dirname(graph_net_bench.__file__)))") +OUTPUT_PATH=/tmp/workspace_eval_device_diff_test +REFERENCE_DIR="$OUTPUT_PATH/reference" + +mkdir -p "$OUTPUT_PATH" +mkdir -p "$REFERENCE_DIR" + +MODEL_PATH="$AI4C_ROOT/samples/ultralytics/yolov3-tinyu" + +echo "==========================================" +echo "Step 1: Generate reference on device A (simulated)" +echo "==========================================" +python3 -m graph_net.torch.test_reference_device \ + --model-path "$MODEL_PATH" \ + --compiler nope \ + --device cuda \ + --warmup 1 \ + --trials 1 \ + --reference-dir "$REFERENCE_DIR" \ + 2>&1 | tee "$OUTPUT_PATH/reference.log" + +echo "" +echo "==========================================" +echo "Step 2: Compare on device B (simulated)" +echo "==========================================" +python3 -m graph_net.torch.test_target_device \ + --model-path "$MODEL_PATH" \ + --device cuda \ + --reference-dir "$REFERENCE_DIR" \ + 2>&1 | tee "$OUTPUT_PATH/target.log" + +echo "" +echo 
"==========================================" +echo "Test completed. Logs saved to: $OUTPUT_PATH" +echo "==========================================" \ No newline at end of file From 74b5238ef64ed5a1c08eceed44bbe0e77cc3f72c Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 15:15:45 +0800 Subject: [PATCH 15/17] reuse eval_backend_perf, eval_backend_diff instead of test_compiler in test_device --- graph_net/torch/test_reference_device.py | 96 +++++-------------- graph_net/torch/test_target_device.py | 114 +++++++++-------------- 2 files changed, 66 insertions(+), 144 deletions(-) diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index 6a28095e4..bb80c1e8c 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -1,76 +1,33 @@ import argparse -import torch import os -from pathlib import Path -from contextlib import redirect_stdout, redirect_stderr -import json import sys -import traceback +import types +from pathlib import Path from graph_net_bench import path_utils -from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler -from graph_net_bench.torch import utils, eval_backend_perf +from graph_net_bench.torch import eval_backend_perf + + +def convert_args_for_eval_backend(args): + """Convert test_reference_device args to eval_backend_perf args format.""" + return types.SimpleNamespace( + model_path=args.model_path, + output_path=args.reference_dir, + seed=args.seed, + compiler=args.compiler, + device=args.device, + op_lib=args.op_lib, + warmup=args.warmup, + trials=args.trials, + log_prompt=args.log_prompt, + backend_config=getattr(args, "config", None), + ) def test_single_model(args): - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - ref_dump = utils.get_output_path(args.reference_dir, args.model_path) - print(f"Reference log path: {ref_log}", file=sys.stderr, flush=True) - print(f"Reference outputs path: {ref_dump}", file=sys.stderr, flush=True) - - with open(ref_log, "w", encoding="utf-8") as log_f: - with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = test_compiler.get_compiler_backend(args) - - input_dict = test_compiler.get_input_dict(args) - model = test_compiler.get_model(args) - model.eval() - - test_compiler_util.print_with_log_prompt( - "[Config] seed:", args.seed, args.log_prompt - ) - - test_compiler_util.print_basic_config( - args, - test_compiler.get_hardward_name(args), - test_compiler.get_compile_framework_version(args), - ) - - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt - ) - - success = False - time_stats = {} - try: - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = test_compiler.measure_performance( - model_call, args, compiler - ) - success = True - except Exception as e: - print( - f"Run model failed: {str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) - - test_compiler_util.print_running_status(args, success) - if success: - torch.save(outputs, str(ref_dump)) - test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt - ) - - with open(ref_log, "r", encoding="utf-8") as f: - content = f.read() - print(content, file=sys.stderr, flush=True) + eval_args = convert_args_for_eval_backend(args) + 
eval_backend_perf.eval_single_model_with_single_backend(eval_args) def test_multi_models(args): @@ -118,14 +75,9 @@ def test_multi_models(args): def main(args): assert os.path.isdir(args.model_path) - # Support all torch compilers - valid_compilers = list(test_compiler.compiler_backend_name2class.keys()) - assert ( - args.compiler in valid_compilers - ), f"Compiler must be one of {valid_compilers}" - assert args.device in ["cuda"] - - test_compiler.set_seed(random_seed=args.seed) + assert args.device in ["cuda", "cpu"] + + eval_backend_perf.set_seed(args.seed) ref_dump_dir = Path(args.reference_dir) ref_dump_dir.mkdir(parents=True, exist_ok=True) diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index cf56dee69..ee46ceee6 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -1,14 +1,13 @@ import argparse import os -import json import sys -import traceback +import types import torch from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler, utils, eval_backend_perf +from graph_net_bench.torch import utils, eval_backend_perf, eval_backend_diff def parse_config_from_reference_log(log_path): @@ -30,84 +29,55 @@ def parse_config_from_reference_log(log_path): return config -def parse_time_stats_from_reference_log(log_path): - assert os.path.isfile( - log_path - ), f"{log_path} does not exist or is not a regular file." - - with open(log_path, "r", encoding="utf-8") as f: - lines = f.readlines() - for line in reversed(lines): - if "[Performance][eager]" in line: - start = line.find("{") - end = line.rfind("}") - time_stats = json.loads(line[start : end + 1]) - return time_stats - - -def update_args_and_set_seed(args, model_path): +def get_ref_config_from_log(args, model_path): + """Extract config from reference log file.""" ref_log = utils.get_log_path(args.reference_dir, model_path) config = parse_config_from_reference_log(ref_log) - vars(args)["model_path"] = model_path - vars(args)["compiler"] = config.get("compiler") - vars(args)["trials"] = int(config.get("trials")) - vars(args)["warmup"] = int(config.get("warmup")) - test_compiler.set_seed(random_seed=int(config.get("seed"))) - return args - - -def test_single_model(args): - compiler = test_compiler.get_compiler_backend(args) + return config - input_dict = test_compiler.get_input_dict(args) - model = test_compiler.get_model(args) - model.eval() - model_path = os.path.normpath(args.model_path) - test_compiler_util.print_with_log_prompt( - "[Processing]", model_path, args.log_prompt - ) - test_compiler_util.print_basic_config( - args, - test_compiler.get_hardward_name(args), - test_compiler.get_compile_framework_version(args), +def convert_args_for_eval_backend(args, output_path): + """Convert test_target_device args to eval_backend_perf args format.""" + return types.SimpleNamespace( + model_path=args.model_path, + output_path=output_path, + seed=args.seed, + compiler=args.compiler, + device=args.device, + op_lib=args.op_lib, + warmup=args.warmup, + trials=args.trials, + log_prompt=args.log_prompt, + backend_config=getattr(args, "config", None), ) - success = False - time_stats = {} - try: - compiled_model = compiler(model) - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = test_compiler.measure_performance( - model_call, args, compiler - ) - success = True - except Exception as e: - print( - f"Run model failed: 
{str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) +def test_single_model(args): + target_dir = "/tmp/eval_device_diff/target" - test_compiler_util.print_running_status(args, success) + ref_config = get_ref_config_from_log(args, args.model_path) + vars(args)["compiler"] = ref_config.get("compiler") + vars(args)["trials"] = int(ref_config.get("trials")) + vars(args)["warmup"] = int(ref_config.get("warmup")) + vars(args)["seed"] = int(ref_config.get("seed")) - model_name = test_compiler_util.get_model_name(args.model_path) - if test_compiler_util.get_subgraph_tag(args.model_path): - model_name += "_" + test_compiler_util.get_subgraph_tag(args.model_path) + eval_args = convert_args_for_eval_backend(args, target_dir) + eval_backend_perf.eval_single_model_with_single_backend(eval_args) ref_dump = utils.get_output_path(args.reference_dir, args.model_path) ref_out = torch.load(str(ref_dump)) - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - ref_time_stats = parse_time_stats_from_reference_log(ref_log) + ref_time_stats = eval_backend_diff.parse_time_stats_from_reference_log(ref_log) - if success: - test_compiler.compare_correctness(ref_out, outputs, args) + target_dump = utils.get_output_path(target_dir, args.model_path) + target_out = torch.load(str(target_dump)) + target_log = utils.get_log_path(target_dir, args.model_path) + target_time_stats = eval_backend_diff.parse_time_stats_from_reference_log( + target_log + ) - test_compiler_util.print_times_and_speedup(args, ref_time_stats, time_stats) + eval_backend_diff.compare_correctness(ref_out, target_out, eval_args) + test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) def is_reference_log_exist(reference_dir, model_path): @@ -165,16 +135,16 @@ def main(args): if path_utils.is_single_model_dir(args.model_path): if args.op_lib == "origin": - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - config = parse_config_from_reference_log(ref_log) - vars(args)["op_lib"] = config.get("op_lib") - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt + ref_config = get_ref_config_from_log(args, args.model_path) + vars(args)["op_lib"] = ref_config.get("op_lib") + print( + f"{args.log_prompt} [Config] op_lib: {args.op_lib}", + file=sys.stderr, + flush=True, ) else: eval_backend_perf.register_op_lib(args.op_lib) - args = update_args_and_set_seed(args, args.model_path) test_single_model(args) else: test_multi_models(args) From d8514e4a13f67aac1bcd293cbba62cd860008b21 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 16:48:12 +0800 Subject: [PATCH 16/17] move utest --- .../test/test_device_test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/eval_device_diff_test.sh => graph_net/test/test_device_test.sh (100%) diff --git a/test/eval_device_diff_test.sh b/graph_net/test/test_device_test.sh similarity index 100% rename from test/eval_device_diff_test.sh rename to graph_net/test/test_device_test.sh From b83b6a967770a644881a6751800ef7e7dc144a28 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 16:53:13 +0800 Subject: [PATCH 17/17] minor change --- graph_net_bench/torch/eval_backend_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index c254eafaf..cfa171dc6 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ 
b/graph_net_bench/torch/eval_backend_diff.py @@ -190,8 +190,8 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): def eval_single_model(args): - ref_dir = "/tmp/eval_perf_diff/A" - target_dir = "/tmp/eval_perf_diff/B" + ref_dir = "/tmp/eval_perf_diff/reference" + target_dir = "/tmp/eval_perf_diff/target" ref_args = types.SimpleNamespace( model_path=args.model_path,