diff --git a/mlir/utils/performance/tests/conftest.py b/mlir/utils/performance/tests/conftest.py
new file mode 100644
index 000000000000..c7169ec7d1b9
--- /dev/null
+++ b/mlir/utils/performance/tests/conftest.py
@@ -0,0 +1,31 @@
+import sys
+import types
+from pathlib import Path
+
+# Ensure the performance utilities are importable as top-level modules.
+PERFORMANCE_DIR = Path(__file__).resolve().parent.parent
+if str(PERFORMANCE_DIR) not in sys.path:
+    sys.path.insert(0, str(PERFORMANCE_DIR))
+
+
+# Provide a light-weight stub for the optional HIP dependency so imports succeed
+# on hosts without a GPU runtime.
+if "hip" not in sys.modules:
+    hip_module = types.ModuleType("hip")
+    fake_hip = types.SimpleNamespace()
+
+    class FakeHipError(int):
+        hipSuccess = 0
+
+    class FakeDeviceProp:
+        def __init__(self):
+            self.gcnArchName = b"gfx000"
+            self.computeUnit = 0
+
+    fake_hip.hipError_t = FakeHipError
+    fake_hip.hipDeviceProp_t = FakeDeviceProp
+    fake_hip.hipGetDeviceCount = lambda: (FakeHipError(0), 0)
+    fake_hip.hipGetDeviceProperties = lambda props, device: (FakeHipError(0),)
+
+    hip_module.hip = fake_hip
+    sys.modules["hip"] = hip_module
diff --git a/mlir/utils/performance/tests/test_perfRunner.py b/mlir/utils/performance/tests/test_perfRunner.py
new file mode 100644
index 000000000000..a41a0b51d88c
--- /dev/null
+++ b/mlir/utils/performance/tests/test_perfRunner.py
@@ -0,0 +1,326 @@
+import perfRunner
+
+
+def test_layout_round_trip():
+    assert perfRunner.input_layouts("NCHW") == "nchw"
+    assert perfRunner.output_layouts("NCHW") == "nkhw"
+    assert perfRunner.filter_layouts("NCHW") == "kcyx"
+    assert perfRunner.inverse_input_layouts("nchw") == "NCHW"
+    assert perfRunner.inverse_output_layouts("nkhw") == "NCHW"
+    assert perfRunner.inverse_filter_layouts("kcyx") == "NCHW"
+
+
+def test_get_conv_configurations_uses_defaults(monkeypatch, tmp_path):
+    monkeypatch.setattr(perfRunner, "DIRECTIONS", ["-F 1"])
+    monkeypatch.setattr(perfRunner, "DATA_TYPES", ["conv"])
+    monkeypatch.setattr(perfRunner, "LAYOUTS", ["NHWC"])
+    monkeypatch.setattr(perfRunner, "get_chip", lambda: "gfx1200")
+
+    config_file = tmp_path / "conv.txt"
+    config_file.write_text(
+        "-n 1 -c 2 -H 3 -W 4 -k 5 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -g 1\n"
+    )
+
+    configs = perfRunner.get_conv_configurations(str(config_file))
+
+    assert configs == [
+        "conv -F 1 -f NHWC -I NHWC -O NHWC "
+        "-n 1 -c 2 -H 3 -W 4 -k 5 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -g 1"
+    ]
+
+
+def test_conv_configuration_command_generation():
+    cfg = perfRunner.ConvConfiguration(
+        "f32",
+        "fwd",
+        "NCHW",
+        "NCHW",
+        "NCHW",
+        n=1,
+        c=2,
+        hi=3,
+        wi=4,
+        k=5,
+        y=1,
+        x=1,
+        conv_stride_h=1,
+        conv_stride_w=1,
+        padding_h=0,
+        padding_w=0,
+        dilation_h=1,
+        dilation_w=1,
+        group=1,
+        arch="gfx1200",
+        num_cu=10,
+    )
+
+    command = cfg.generate_mlir_driver_commandline("", kernel_repeats=5)
+    assert "--operation conv" in command
+    assert "--fil_layout kcyx" in command
+    assert "--in_layout nchw" in command
+    assert "--out_layout nkhw" in command
+    assert "--kernel-repeats 5" in command
+    assert "--perf_config=" in command
+
+    round_trip = cfg.to_command_line()
+    assert "conv " in round_trip
+    assert "-F 1" in round_trip
+    assert "-f NCHW" in round_trip
+    assert "-I NCHW" in round_trip
+    assert "-O NCHW" in round_trip
+
+
+def test_gemm_configuration_command_generation():
+    cfg = perfRunner.GemmConfiguration(
+        dtype="f32",
+        out_dtype="f32",
+        g=1,
+        m=64,
+        k=32,
+        n=16,
+        trans_a=False,
+        trans_b=True,
+        scaled_gemm=True,
+        scale_a_dtype="f32",
+        scale_b_dtype="f8E8M0FNU",
+        trans_scale_a=True,
+        trans_scale_b=False,
+        arch="gfx1200",
+        num_cu=8,
+        perf_config="best",
+    )
+
+    command = cfg.generate_mlir_driver_commandline("", kernel_repeats=7)
+    assert "-operation gemm" in command
+    assert "-scaledGemm" in command
+    assert "-scale_a_dtype f32" in command
+    assert "-scale_b_dtype f8E8M0FNU" in command
+    assert "-transScaleA=True" in command
+    assert "-transScaleB" not in command  # only omitted when False
+    assert "--kernel-repeats 7" in command
+    assert "--perf_config=best" in command
+
+    round_trip = cfg.to_command_line()
+    assert "-t f32" in round_trip
+    assert "-out_datatype f32" in round_trip
+    assert "-transA false" in round_trip
+    assert "-transB true" in round_trip
+    assert "-scaledGemm" in round_trip
+    assert "-scale_a_dtype f32" in round_trip
+    assert "-scale_b_dtype f8E8M0FNU" in round_trip
+    assert "-transScaleA true" in round_trip
+    assert "-transScaleB" not in round_trip  # only emitted when True
+
+
+def test_attention_configuration_command_generation(monkeypatch):
+    monkeypatch.setattr(perfRunner, "DATA_TYPES_ATTENTION", ["f16"])
+
+    cfg = perfRunner.AttentionConfiguration(
+        dtype="f16",
+        g=1,
+        seq_len_q=128,
+        seq_len_k=128,
+        num_heads_q=4,
+        num_heads_kv=4,
+        head_dim_qk=64,
+        head_dim_v=64,
+        with_attn_scale=True,
+        with_attn_bias=False,
+        trans_q=False,
+        trans_k=True,
+        trans_v=False,
+        trans_o=True,
+        causal=True,
+        return_lse=False,
+        split_kv=1,
+        arch="gfx1200",
+        num_cu=12,
+        perf_config="fast",
+    )
+
+    command = cfg.generate_mlir_driver_commandline("", kernel_repeats=3)
+    assert "-operation attention" in command
+    assert "-t f16" in command
+    assert "-with-attn-scale=True" in command
+    assert "-with-attn-bias=False" in command
+    assert "-transQ=False" in command
+    assert "-transK=True" in command
+    assert "-transO=True" in command
+    assert "-causal=True" in command
+    assert "-return_lse=False" in command
+    assert "-split_kv=1" in command
+    assert "--kernel-repeats 3" in command
+    assert "--perf_config=fast" in command
+
+    round_trip = cfg.to_command_line()
+    assert "-t f16" in round_trip
+    assert "-transQ false" in round_trip
+    assert "-transK true" in round_trip
+    assert "-transO true" in round_trip
+    assert "-causal true" in round_trip
+    assert "-return_lse false" in round_trip
+    assert "-split_kv 1" in round_trip
+    assert "-with-attn-scale true" in round_trip
+    assert "-with-attn-bias false" in round_trip
+
+
+def test_conv_configuration_from_to_command_line_round_trip(monkeypatch):
+    monkeypatch.setattr(perfRunner, "get_chip", lambda: "gfx1200")
+    args = [
+        "conv",
+        "-F",
+        "1",
+        "-f",
+        "NHWC",
+        "-I",
+        "NHWC",
+        "-O",
+        "NHWC",
+        "-n",
+        "2",
+        "-c",
+        "4",
+        "-H",
+        "8",
+        "-W",
+        "8",
+        "-k",
+        "16",
+        "-y",
+        "3",
+        "-x",
+        "3",
+        "-u",
+        "1",
+        "-v",
+        "1",
+        "-p",
+        "1",
+        "-q",
+        "1",
+        "-l",
+        "1",
+        "-j",
+        "1",
+        "-g",
+        "1",
+    ]
+    cfg = perfRunner.ConvConfiguration.from_command_line(args, arch="gfx1200", num_cu=6)
+    round_trip = cfg.to_command_line()
+    assert "-n 2" in round_trip
+    assert "-c 4" in round_trip
+    assert "-F 1" in round_trip
+    assert cfg.arch == "gfx1200"
+    assert cfg.num_cu == 6
+
+
+def test_gemm_configuration_from_to_command_line_round_trip():
+    args = [
+        "-t",
+        "f16",
+        "-out_datatype",
+        "f16",
+        "-g",
+        "1",
+        "-m",
+        "64",
+        "-k",
+        "32",
+        "-n",
+        "16",
+        "-transA",
+        "false",
+        "-transB",
+        "true",
+        "-scaledGemm",
+        "-scale_a_dtype",
+        "f32",
+        "-scale_b_dtype",
+        "f8E8M0FNU",
+        "-perf_config",
+        "cfg",
+    ]
+    cfg = perfRunner.GemmConfiguration.from_command_line(args, arch="gfx1200", num_cu=4)
+    round_trip = cfg.to_command_line()
+    assert "-t f16" in round_trip
+    assert "-out_datatype f16" in round_trip
+    assert "-g 1" in round_trip
+    assert "-transA false" in round_trip
+    assert "-transB true" in round_trip
+    assert "-scale_a_dtype f32" in round_trip
+    assert "-scale_b_dtype f8E8M0FNU" in round_trip
+    assert cfg.perfconfig == "cfg"
+    assert cfg.arch == "gfx1200"
+    assert cfg.num_cu == 4
+
+
+def test_attention_configuration_from_to_command_line_round_trip(monkeypatch):
+    monkeypatch.setattr(perfRunner, "DATA_TYPES_ATTENTION", ["f16"])
+    args = [
+        "-t",
+        "f16",
+        "-g",
+        "2",
+        "-seq_len_q",
+        "128",
+        "-seq_len_k",
+        "128",
+        "-num_heads_q",
+        "4",
+        "-num_heads_kv",
+        "4",
+        "-head_dim_qk",
+        "64",
+        "-head_dim_v",
+        "64",
+        "-with-attn-scale",
+        "true",
+        "-with-attn-bias",
+        "false",
+        "-transQ",
+        "false",
+        "-transK",
+        "true",
+        "-transV",
+        "false",
+        "-transO",
+        "true",
+        "-causal",
+        "true",
+        "-return_lse",
+        "false",
+        "-split_kv",
+        "1",
+        "-perf_config",
+        "pc",
+    ]
+    cfg = perfRunner.AttentionConfiguration.from_command_line(args, arch="gfx1200", num_cu=10)
+    round_trip = cfg.to_command_line()
+    assert "-t f16" in round_trip
+    assert "-g 2" in round_trip
+    assert "-seq_len_q 128" in round_trip
+    assert "-num_heads_q 4" in round_trip
+    assert "-with-attn-scale true" in round_trip
+    assert "-with-attn-bias false" in round_trip
+    assert "-transK true" in round_trip
+    assert "-causal true" in round_trip
+    assert cfg.perfconfig == "pc"
+    assert cfg.arch == "gfx1200"
+    assert cfg.num_cu == 10
+
+
+def test_get_gemm_configurations_scaled(monkeypatch, tmp_path):
+    monkeypatch.setattr(perfRunner, "get_chip", lambda: "gfx1200")
+    config_file = tmp_path / "gemm.txt"
+    config_file.write_text("-m 128 -k 64 -n 256 -scaledGemm\n")
+
+    configs = perfRunner.get_gemm_configurations(
+        str(config_file), datatypes=["f16"], scale_types=["f32", "f8E8M0FNU"]
+    )
+
+    assert len(configs) == 16  # 1 dtype * 2 transA * 2 transB * 2 scale_a * 2 scale_b
+    assert any(
+        "-scale_a_dtype f32" in entry and "-scale_b_dtype f8E8M0FNU" in entry
+        for entry in configs
+    )
+    assert all("-t f16" in entry for entry in configs)
diff --git a/mlir/utils/performance/tests/test_tuningRunner.py b/mlir/utils/performance/tests/test_tuningRunner.py
new file mode 100644
index 000000000000..de22a30a43d5
--- /dev/null
+++ b/mlir/utils/performance/tests/test_tuningRunner.py
@@ -0,0 +1,59 @@
+import numpy as np
+
+import tuningRunner
+
+
+def test_verify_mode_flags():
+    assert tuningRunner.verify_mode_flags("none") == ""
+    assert tuningRunner.verify_mode_flags("cpu").strip() == "-pv"
+    assert "--verifier-keep-perf-config=false" in tuningRunner.verify_mode_flags("gpu")
+    try:
+        tuningRunner.verify_mode_flags("unknown")
+        assert False, "verify_mode_flags should raise for unknown modes"
+    except ValueError:
+        pass
+
+
+def test_get_winning_config_prefers_fastest(monkeypatch):
+    class DummyConfig:
+        def __init__(self):
+            self.perfconfigs = []
+
+        def set_perfconfig(self, perfconfig):
+            self.perfconfigs.append(perfconfig)
+
+        def table_entry(self, nanoseconds):
+            score = np.nan
+            if not np.isnan(nanoseconds):
+                score = 1000.0 / nanoseconds
+            return {"TFlops": score}
+
+    options = tuningRunner.Options(
+        debug=False,
+        tuning_space_kind="full",
+        quiet=True,
+        arch="gfx1200",
+        num_cu=10,
+        rocmlir_gen_flags="",
+        verify_mode="none",
+        verify_perfconfigs=False,
+        tflops=False,
+        compact_print=True,
+    )
+
+    dummy_config = DummyConfig()
+    all_data = []
+    winner, max_tflops = tuningRunner.get_winning_config(
+        [b"fast\t5", b"slow\t10", b"skip_me\tN/A"],
+        "vector",
+        dummy_config,
+        all_data,
+        paths=None,
+        options=options,
+    )
+
+    assert winner == "fast"
+    assert max_tflops == 200.0
+    assert len(all_data) == 3
+    assert dummy_config.perfconfigs[0] == "fast"
+    assert dummy_config.perfconfigs[1] == "slow"