From 00f5eb79c08ca7c4f9f7d457abcf933613c67c8e Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Mon, 8 Dec 2025 11:11:51 +0000
Subject: [PATCH 01/17] feat: draft kernel tuning based on vllm fns

---
 src/pruna/algorithms/moe_kernel_tuner.py | 120 +++++++++++++++++++++++
 src/pruna/engine/model_checks.py         |  36 +++++++
 2 files changed, 156 insertions(+)
 create mode 100644 src/pruna/algorithms/moe_kernel_tuner.py

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
new file mode 100644
index 00000000..8a558f7c
--- /dev/null
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -0,0 +1,120 @@
+# Copyright 2025 - Pruna AI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import functools
+from collections.abc import Iterable
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import ray
+import ray.experimental.tqdm_ray as tqdm_ray
+
+from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
+from pruna.algorithms.base.tags import AlgorithmTag as tags
+from pruna.config.smash_config import SmashConfigPrefixWrapper
+from pruna.engine.model_checks import is_moe_lm, is_transformers_pipeline_with_moe_lm
+from pruna.logging.logger import pruna_logger
+
+
+class MoeKernelTuner(PrunaAlgorithmBase):
+    """
+    Tune the MoE kernel for the model.
+
+    Uses vLLM to tune the MoE kernel of the model.
+    """
+
+    algorithm_name: str = "moe_kernel_tuner"
+    group_tags: list[str] = [tags.KERNEL]
+    save_fn :None = None
+    references: dict[str, str] = {
+        "GitHub": "https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py",
+    }
+    tokenizer_required: bool = False
+    processor_required: bool = False
+    runs_on: list[str] = ["cuda", "accelerate"]
+    dataset_required: bool = False
+    compatible_before: Iterable[str] = [tags.KERNEL, tags.QUANTIZER, tags.PRUNER, tags.CACHER, tags.FACTORIZER, tags.BATCHER, tags.COMPILER]
+    compatible_after: Iterable[str] = [tags.KERNEL, tags.QUANTIZER, tags.PRUNER, tags.CACHER, tags.FACTORIZER, tags.BATCHER, tags.COMPILER]
+    required_install = "``uv pip install vllm``"
+
+    def model_check_fn(self, model: Any) -> bool:
+        """
+        Check if the model is a MoE model.
+
+        Parameters
+        ----------
+        model : Any
+            The model to check.
+
+        Returns
+        -------
+        bool
+            True if the model is a valid model for the algorithm, False otherwise.
+        """
+        # Hunyuan3-image is a MoE model, but not depending on mixtral
+        if model.__class__.__name__ == "HunyuanImage3ForCausalMM":
+            return True
+        else:
+            return is_moe_lm(model) or is_transformers_pipeline_with_moe_lm(model)
+
+    def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
+        """
+        Wrap the model to use flash_attn3 where possible.
+
+        Parameters
+        ----------
+        model : Any
+            The model to wrap.
+        smash_config : SmashConfigPrefixWrapper
+            The configuration for the application of the algorithm.
+
+        Returns
+        -------
+        Any
+            The wrapped model.
+        """
+        imported_packages = self.import_algorithm_packages()
+
+        #TODO: Implement the MoE kernel tuning.
+        return model
+
+    def import_algorithm_packages(self) -> Dict[str, Any]:
+        """
+        Import the algorithm packages.
+
+        Returns
+        -------
+        Dict[str, Any]
+            The algorithm packages.
+        """
+        from vllm.model_executor.layers.fused_moe.config import (
+            FusedMoEQuantConfig,
+            _get_config_dtype_str,
+        )
+        import vllm.model_executor.layers.fused_moe.fused_moe as fused_moe
+        import vllm.platforms as vllm_platforms
+        from vllm.transformers_utils.config import get_config
+        from vllm.triton_utils import triton
+        from vllm.utils.argparse_utils import FlexibleArgumentParser
+        return dict(    
+            FusedMoEQuantConfig=FusedMoEQuantConfig,
+            _get_config_dtype_str=_get_config_dtype_str,
+            FusedMoE=fused_moe,
+            vllm_platforms=vllm_platforms,
+            get_config=get_config,
+            triton=triton,
+            FlexibleArgumentParser=FlexibleArgumentParser,
+            tqdm_ray=tqdm_ray,
+        )
diff --git a/src/pruna/engine/model_checks.py b/src/pruna/engine/model_checks.py
index 3ea41321..6a4bbb75 100644
--- a/src/pruna/engine/model_checks.py
+++ b/src/pruna/engine/model_checks.py
@@ -105,6 +105,25 @@ def is_speech_seq2seq_model(model: Any) -> bool:
     return False
 
 
+def is_moe_lm(model: Any) -> bool:
+    """
+    Check if the model is a MoE LM.
+
+    Currently all MoE LMs are based on Mixtral in transformers.
+
+    Parameters
+    ----------
+    model : Any
+        The model to check.
+
+    Returns
+    -------
+    bool
+        True if the model is a MoE LM, False otherwise.
+    """
+    return hasattr(model, "num_experts")
+
+
 def is_transformers_pipeline_with_causal_lm(model: Any) -> bool:
     """
     Check if the model is a transformers pipeline (for tasks like text generation, classification, etc.).
@@ -158,6 +177,23 @@ def is_transformers_pipeline_with_speech_recognition(model: Any) -> bool:
     )
 
 
+def is_transformers_pipeline_with_moe_lm(model: Any) -> bool:
+    """
+    Check if the model is a transformers pipeline with a MoE LM.
+
+    Parameters
+    ----------
+    model : Any
+        The model to check.
+
+    Returns
+    -------
+    bool
+        True if the model is a transformers pipeline with a MoE LM, False otherwise.
+    """
+    return isinstance(model, TextGenerationPipeline) and is_moe_lm(getattr(model, "model", None))
+
+
 def is_diffusers_pipeline(model: Any, include_video: bool = False) -> bool:
     """
     Check if the model is a diffusers pipeline.

From 1a0d45eca05484b5761ad60e3dcf21f90dddb0b7 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Mon, 8 Dec 2025 11:12:28 +0000
Subject: [PATCH 02/17] feat: draft kernel tuning based on vllm fns

---
 src/pruna/algorithms/moe_kernel_tuner.py | 37 ++++++++++++++++--------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 8a558f7c..59e794e4 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -13,19 +13,15 @@
 # limitations under the License.
 from __future__ import annotations
 
-import functools
 from collections.abc import Iterable
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict
 
-import torch
-import ray
 import ray.experimental.tqdm_ray as tqdm_ray
 
 from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
 from pruna.algorithms.base.tags import AlgorithmTag as tags
 from pruna.config.smash_config import SmashConfigPrefixWrapper
 from pruna.engine.model_checks import is_moe_lm, is_transformers_pipeline_with_moe_lm
-from pruna.logging.logger import pruna_logger
 
 
 class MoeKernelTuner(PrunaAlgorithmBase):
@@ -37,7 +33,7 @@ class MoeKernelTuner(PrunaAlgorithmBase):
 
     algorithm_name: str = "moe_kernel_tuner"
     group_tags: list[str] = [tags.KERNEL]
-    save_fn :None = None
+    save_fn: None = None
     references: dict[str, str] = {
         "GitHub": "https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py",
     }
@@ -45,8 +41,24 @@ class MoeKernelTuner(PrunaAlgorithmBase):
     processor_required: bool = False
     runs_on: list[str] = ["cuda", "accelerate"]
     dataset_required: bool = False
-    compatible_before: Iterable[str] = [tags.KERNEL, tags.QUANTIZER, tags.PRUNER, tags.CACHER, tags.FACTORIZER, tags.BATCHER, tags.COMPILER]
-    compatible_after: Iterable[str] = [tags.KERNEL, tags.QUANTIZER, tags.PRUNER, tags.CACHER, tags.FACTORIZER, tags.BATCHER, tags.COMPILER]
+    compatible_before: Iterable[str] = [
+        tags.KERNEL,
+        tags.QUANTIZER,
+        tags.PRUNER,
+        tags.CACHER,
+        tags.FACTORIZER,
+        tags.BATCHER,
+        tags.COMPILER,
+    ]
+    compatible_after: Iterable[str] = [
+        tags.KERNEL,
+        tags.QUANTIZER,
+        tags.PRUNER,
+        tags.CACHER,
+        tags.FACTORIZER,
+        tags.BATCHER,
+        tags.COMPILER,
+    ]
     required_install = "``uv pip install vllm``"
 
     def model_check_fn(self, model: Any) -> bool:
@@ -87,7 +99,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         """
         imported_packages = self.import_algorithm_packages()
 
-        #TODO: Implement the MoE kernel tuning.
+        # TODO: Implement the MoE kernel tuning.
         return model
 
     def import_algorithm_packages(self) -> Dict[str, Any]:
@@ -99,16 +111,17 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
         Dict[str, Any]
             The algorithm packages.
         """
+        import vllm.model_executor.layers.fused_moe.fused_moe as fused_moe
+        import vllm.platforms as vllm_platforms
         from vllm.model_executor.layers.fused_moe.config import (
             FusedMoEQuantConfig,
             _get_config_dtype_str,
         )
-        import vllm.model_executor.layers.fused_moe.fused_moe as fused_moe
-        import vllm.platforms as vllm_platforms
         from vllm.transformers_utils.config import get_config
         from vllm.triton_utils import triton
         from vllm.utils.argparse_utils import FlexibleArgumentParser
-        return dict(    
+
+        return dict(
             FusedMoEQuantConfig=FusedMoEQuantConfig,
             _get_config_dtype_str=_get_config_dtype_str,
             FusedMoE=fused_moe,

From 8e3aec47d9abe44bdc33e93bd6ed28ceefe21248 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Mon, 8 Dec 2025 16:39:09 +0000
Subject: [PATCH 03/17] feat: add benchmark fn and saving draft

---
 src/pruna/algorithms/moe_kernel_tuner.py | 452 ++++++++++++++++++++++-
 1 file changed, 446 insertions(+), 6 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 59e794e4..14b2a0df 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -14,14 +14,24 @@
 from __future__ import annotations
 
 from collections.abc import Iterable
-from typing import Any, Dict
+from typing import Any, Dict, TypedDict
 
 import ray.experimental.tqdm_ray as tqdm_ray
+import ray
+import time
+import torch
+from ConfigSpace import CategoricalHyperparameter, OrdinalHyperparameter
+from contextlib import nullcontext
+from datetime import datetime
+import os
+import json
+from itertools import product
 
 from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
 from pruna.algorithms.base.tags import AlgorithmTag as tags
 from pruna.config.smash_config import SmashConfigPrefixWrapper
 from pruna.engine.model_checks import is_moe_lm, is_transformers_pipeline_with_moe_lm
+from pruna.logging.logger import pruna_logger
 
 
 class MoeKernelTuner(PrunaAlgorithmBase):
@@ -61,6 +71,36 @@ class MoeKernelTuner(PrunaAlgorithmBase):
     ]
     required_install = "``uv pip install vllm``"
 
+    def get_hyperparameters(self) -> list:
+        """
+        Configure all algorithm-specific hyperparameters with ConfigSpace.
+
+        Returns
+        -------
+        list
+            The hyperparameters.
+        """
+        return [
+            CategoricalHyperparameter(
+                "compute_dtype",
+                choices=["bfloat16", "float16"],
+                default_value="bfloat16",
+                meta=dict(desc="Compute dtype to use."),
+            ),
+            CategoricalHyperparameter(
+                "weight_dtype",
+                choices=["fp8_w8a8", "int8_w8a16"],
+                default_value="fp8_w8a8",
+                meta=dict(desc="Dtype to use for the weights (and activations)."),
+            ),
+            OrdinalHyperparameter(
+                "tensor_parallel_size",
+                sequence=[1, 2, 4, 8, 16, 32],
+                default_value=1,
+                meta=dict(desc="Tensor parallel size to use if the model can not fit on a single GPU."),
+            ),
+        ]
+
     def model_check_fn(self, model: Any) -> bool:
         """
         Check if the model is a MoE model.
@@ -99,7 +139,147 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         """
         imported_packages = self.import_algorithm_packages()
 
-        # TODO: Implement the MoE kernel tuning.
+        @ray.remote(num_gpus=1)
+        class BenchmarkWorker:
+            def __init__(self, seed: int) -> None:
+                torch.set_default_device("cuda")
+                imported_packages["vllm_platforms"].current_platform.seed_everything(seed)
+                self.seed = seed
+                self.device_id = int(ray.get_gpu_ids()[0])
+
+            def tune(
+                self,
+                num_tokens: int,
+                num_experts: int,
+                shard_intermediate_size: int,
+                hidden_size: int,
+                topk: int,
+                dtype: torch.dtype,
+                use_fp8_w8a8: bool,
+                use_int8_w8a16: bool,
+                search_space: list[dict[str, int]],
+                block_quant_shape: list[int],
+                use_deep_gemm: bool,
+            ) -> dict[str, int]:
+                best_config = None
+                best_time = float("inf")
+
+                need_device_guard = False
+
+                with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+                    for config in tqdm_ray(search_space):
+                        try:
+                            kernel_time = benchmark_config(
+                                config,
+                                num_tokens,
+                                num_experts,
+                                shard_intermediate_size,
+                                hidden_size,
+                                topk,
+                                dtype,
+                                use_fp8_w8a8,
+                                use_int8_w8a16,
+                                num_iters=20,
+                                block_quant_shape=block_quant_shape,
+                                use_deep_gemm=use_deep_gemm,
+                            )
+                        except imported_packages["triton"].runtime.autotuner.OutOfResources:
+                            # Some configurations may be invalid and fail to compile.
+                            continue
+
+                        if kernel_time < best_time:
+                            best_time = kernel_time
+                            best_config = config
+                now = datetime.now()
+                pruna_logger.info(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+                assert best_config is not None
+                return best_config
+
+        E = model.num_experts if is_moe_lm(model)or is_transformers_pipeline_with_moe_lm(model) else model.num_experts                # number of experts
+        topk = model.num_experts_per_tok                    # number of active experts per token
+        intermediate_size = model.intermediate_size # 3072 # FFN intermediate size
+        hidden_size = model.hidden_size #4096        # model hidden dim
+        assert intermediate_size % smash_config["tensor_parallel_size"] == 0, (
+            f"intermediate_size {intermediate_size} is not divisible by tp "
+            f"{smash_config['tensor_parallel_size']}."
+        )
+        shard_intermediate_size = 2 * intermediate_size // smash_config["tensor_parallel_size"]
+        dtype = smash_config["compute_dtype"]
+        use_fp8_w8a8 = smash_config["weight_dtype"] == "fp8_w8a8"
+        use_int8_w8a16 = smash_config["weight_dtype"] == "int8_w8a16"
+        FP8_DTYPE = imported_packages["vllm_platforms"].current_platform.fp8_dtype()
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+
+        ray.init()
+        num_gpus = int(ray.available_resources()["GPU"])
+        workers = [BenchmarkWorker.remote(0) for _ in range(num_gpus)]
+
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16, None)
+        pruna_logger.info(f"Start tuning over {len(search_space)} configurations...")
+
+        start = time.time()
+        outputs = []
+        worker_idx = 0
+        for batch_size in batch_sizes:
+            input_args = (
+                batch_size,
+                E,
+                shard_intermediate_size,
+                hidden_size,
+                topk,
+                dtype,
+                use_fp8_w8a8,
+                use_int8_w8a16,
+                search_space,
+                None,
+                False,
+            )
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, "tune")
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        configs = ray.get(outputs)
+
+        best_configs = {
+            M: sort_config(config) for M, config in zip(batch_sizes, configs)
+        }
+        self.save_configs(
+            best_configs,
+            E,
+            shard_intermediate_size,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            None,
+            args.save_dir,
+            imported_packages,
+        )
+        end = time.time()
+        pruna_logger.info(f"Tuning took {end - start:.2f} seconds")
+
         return model
 
     def import_algorithm_packages(self) -> Dict[str, Any]:
@@ -117,17 +297,277 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
             FusedMoEQuantConfig,
             _get_config_dtype_str,
         )
-        from vllm.transformers_utils.config import get_config
+        from vllm.model_executor.layers.fused_moe import override_config
         from vllm.triton_utils import triton
-        from vllm.utils.argparse_utils import FlexibleArgumentParser
 
         return dict(
             FusedMoEQuantConfig=FusedMoEQuantConfig,
             _get_config_dtype_str=_get_config_dtype_str,
             FusedMoE=fused_moe,
             vllm_platforms=vllm_platforms,
-            get_config=get_config,
             triton=triton,
-            FlexibleArgumentParser=FlexibleArgumentParser,
             tqdm_ray=tqdm_ray,
+            override_config=override_config,
+        )
+
+    def save_configs(
+        configs: dict[int, BenchmarkConfig],
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        block_quant_shape: list[int],
+        save_dir: str,
+        imported_packages: Dict[str, Any],
+    ) -> None:
+        dtype_str = imported_packages["_get_config_dtype_str"](
+            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+        )
+
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        filename = imported_packages["fused_moe"].get_config_file_name(
+            num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
+        )
+
+        # We want to save at 3 different places:
+        # 1. The cache of vllm
+        # 2. The cache of kernels hub
+        # 3. The smashconfig (to be reused once mode is smashed and saved).
+
+
+        os.makedirs(save_dir, exist_ok=True)
+        filename = os.path.join(save_dir, filename)
+        pruna_logger.info(f"Writing best config to {filename}...")
+        with open(filename, "w") as f:
+            json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
+            f.write("\n")
+
+
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    return {
+        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
+        "num_warps": config["num_warps"],
+        "num_stages": config["num_stages"],
+        **(
+            {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {}
+        ),
+        **(
+            {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]}
+            if "matrix_instr_nonkdim" in config
+            else {}
+        ),
+        **({"kpack": config["kpack"]} if "kpack" in config else {}),
+    }
+
+
+def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]:
+    configs: list[BenchmarkConfig] = []
+
+    # Reduced search space for faster tuning.
+    block_m_range = [16, 32, 64, 128, 256]
+    block_n_range = [32, 64, 128, 256]
+    block_k_range = [64, 128, 256]
+    num_warps_range = [4, 8]
+    group_m_range = [1, 16, 32, 64]
+    num_stage_range = [2, 3, 4, 5]
+
+    param_ranges = {
+        "BLOCK_SIZE_M": block_m_range,
+        "BLOCK_SIZE_N": block_n_range,
+        "BLOCK_SIZE_K": block_k_range,
+        "GROUP_SIZE_M": group_m_range,
+        "num_warps": num_warps_range,
+        "num_stages": num_stage_range,
+    }
+
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
+
+    # Remove configs that are not compatible with fp8 block quantization
+    # BLOCK_SIZE_K must be a multiple of block_k
+    # BLOCK_SIZE_N must be a multiple of block_n
+    if block_quant_shape is not None and not use_fp16:
+        block_n, block_k = block_quant_shape[0], block_quant_shape[1]
+        for config in configs[:]:
+            if (
+                config["BLOCK_SIZE_K"] % block_k != 0
+                or config["BLOCK_SIZE_N"] % block_n != 0
+            ):
+                configs.remove(config)
+    return configs
+
+
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    block_quant_shape: list[int] = None,
+    use_deep_gemm: bool = False,
+    imported_packages: Dict[str, Any] = None,
+) -> float:
+    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    if use_int8_w8a16:
+        w1 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+            ),
+            dtype=torch.int8,
+        )
+        w2 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                hidden_size,
+                shard_intermediate_size // 2,
+            ),
+            dtype=torch.int8,
+        )
+    else:
+        w1 = torch.randn(
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+        )
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+        )
+    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_int8_w8a16:
+        w1_scale = torch.randn(
+            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
+        )
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+    if use_deep_gemm:
+        # we use the default block shape for deepgemm
+        block_quant_shape = [128, 128]
+    if use_fp8_w8a8:
+        if block_quant_shape:
+            block_n, block_k = block_quant_shape[0], block_quant_shape[1]
+            E = num_experts
+            N = shard_intermediate_size // 2
+            K = hidden_size
+            factor_for_scale = 1e-2
+            n_tiles_w1 = (2 * N + block_n - 1) // block_n
+            n_tiles_w2 = (K + block_n - 1) // block_n
+            k_tiles_w1 = (K + block_k - 1) // block_k
+            k_tiles_w2 = (N + block_k - 1) // block_k
+            w1_scale = (
+                torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+                * factor_for_scale
+            )
+            w2_scale = (
+                torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+                * factor_for_scale
+            )
+        else:
+            w1_scale = torch.randn(num_experts, dtype=torch.float32)
+            w2_scale = torch.randn(num_experts, dtype=torch.float32)
+
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+
+        w1 = w1.to(imported_packages["vllm_platforms"].current_platform.fp8_dtype())
+        w2 = w2.to(imported_packages["vllm_platforms"].current_platform.fp8_dtype())
+
+    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+
+    def prepare(i: int):
+        input_gating.copy_(gating_output[i])
+
+    def run():
+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = imported_packages["FusedMoEQuantConfig"].make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
         )
+
+        with imported_packages["override_config"](config):
+            topk_weights, topk_ids, token_expert_indices = imported_packages["FusedMoE"].fused_topk(
+                x, input_gating, topk, renormalize=not use_deep_gemm
+            )
+            return imported_packages["FusedMoE"].fused_experts(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
+            )
+
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: list[float] = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg

From 3070db118fd2052abf973f89cb7c74c4af12b0ed Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Thu, 18 Dec 2025 16:27:04 +0000
Subject: [PATCH 04/17] feat: clean and simplify tuning and config saving

---
 src/pruna/algorithms/moe_kernel_tuner.py | 432 ++++++++++++++++-------
 1 file changed, 305 insertions(+), 127 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 14b2a0df..6daaac57 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -21,22 +21,24 @@
 import time
 import torch
 from ConfigSpace import CategoricalHyperparameter, OrdinalHyperparameter
-from contextlib import nullcontext
 from datetime import datetime
 import os
 import json
 from itertools import product
+from importlib.util import find_spec
 
 from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
 from pruna.algorithms.base.tags import AlgorithmTag as tags
+from pruna.config.hyperparameters import UnconstrainedHyperparameter
 from pruna.config.smash_config import SmashConfigPrefixWrapper
+from pruna.engine.load import LOAD_FUNCTIONS
 from pruna.engine.model_checks import is_moe_lm, is_transformers_pipeline_with_moe_lm
 from pruna.logging.logger import pruna_logger
 
 
 class MoeKernelTuner(PrunaAlgorithmBase):
     """
-    Tune the MoE kernel for the model.
+    Tune the MoE Triton kernel for the model.
 
     Uses vLLM to tune the MoE kernel of the model.
     """
@@ -99,6 +101,28 @@ def get_hyperparameters(self) -> list:
                 default_value=1,
                 meta=dict(desc="Tensor parallel size to use if the model can not fit on a single GPU."),
             ),
+            UnconstrainedHyperparameter(
+                "path_to_huggingface_hub_cache",
+                default_value="~",
+                meta=dict(
+                    desc=(
+                        "Path to the Hugging Face Hub cache directory "
+                        "(that contains `kernels` configs). If not provided, "
+                        "the cache will be saved in the current working directory."
+                    )
+                ),
+            ),
+            UnconstrainedHyperparameter(
+                "path_to_vllm_cache",
+                default_value="vllm/model_executor/layers/fused_moe/configs",
+                meta=dict(desc="Path to the vLLM MoE configs directory."),
+            ),
+            OrdinalHyperparameter(
+                "num_iters",
+                sequence=[1, 20, 50, 100],
+                default_value=20,
+                meta=dict(desc="Number of iterations to average the kernel times on."),
+            ),
         ]
 
     def model_check_fn(self, model: Any) -> bool:
@@ -123,7 +147,7 @@ def model_check_fn(self, model: Any) -> bool:
 
     def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         """
-        Wrap the model to use flash_attn3 where possible.
+        Tune the MoE Triton kernel for the model.
 
         Parameters
         ----------
@@ -135,79 +159,33 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         Returns
         -------
         Any
-            The wrapped model.
+            The untouched model.
         """
+        if is_transformers_pipeline_with_moe_lm(model):
+            return self._apply_to_model_within_transformers_pipeline(model, smash_config)
+
         imported_packages = self.import_algorithm_packages()
 
-        @ray.remote(num_gpus=1)
-        class BenchmarkWorker:
-            def __init__(self, seed: int) -> None:
-                torch.set_default_device("cuda")
-                imported_packages["vllm_platforms"].current_platform.seed_everything(seed)
-                self.seed = seed
-                self.device_id = int(ray.get_gpu_ids()[0])
-
-            def tune(
-                self,
-                num_tokens: int,
-                num_experts: int,
-                shard_intermediate_size: int,
-                hidden_size: int,
-                topk: int,
-                dtype: torch.dtype,
-                use_fp8_w8a8: bool,
-                use_int8_w8a16: bool,
-                search_space: list[dict[str, int]],
-                block_quant_shape: list[int],
-                use_deep_gemm: bool,
-            ) -> dict[str, int]:
-                best_config = None
-                best_time = float("inf")
-
-                need_device_guard = False
-
-                with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
-                    for config in tqdm_ray(search_space):
-                        try:
-                            kernel_time = benchmark_config(
-                                config,
-                                num_tokens,
-                                num_experts,
-                                shard_intermediate_size,
-                                hidden_size,
-                                topk,
-                                dtype,
-                                use_fp8_w8a8,
-                                use_int8_w8a16,
-                                num_iters=20,
-                                block_quant_shape=block_quant_shape,
-                                use_deep_gemm=use_deep_gemm,
-                            )
-                        except imported_packages["triton"].runtime.autotuner.OutOfResources:
-                            # Some configurations may be invalid and fail to compile.
-                            continue
-
-                        if kernel_time < best_time:
-                            best_time = kernel_time
-                            best_config = config
-                now = datetime.now()
-                pruna_logger.info(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
-                assert best_config is not None
-                return best_config
-
-        E = model.num_experts if is_moe_lm(model)or is_transformers_pipeline_with_moe_lm(model) else model.num_experts                # number of experts
-        topk = model.num_experts_per_tok                    # number of active experts per token
-        intermediate_size = model.intermediate_size # 3072 # FFN intermediate size
-        hidden_size = model.hidden_size #4096        # model hidden dim
+        # (i) Get the MoE parameters
+        model_config = model.config
+        if model_config is None:
+            raise ValueError(f"Model {model.__class__.__name__} has no config.")
+        E = model_config.num_experts                # number of experts
+        topk = model_config.num_experts_per_tok if is_moe_lm(model) else model_config.moe_topk[0] # number of active experts per token
+        intermediate_size = model_config.intermediate_size # 3072 # FFN intermediate size
+        hidden_size = model_config.hidden_size #4096        # model hidden dim
         assert intermediate_size % smash_config["tensor_parallel_size"] == 0, (
             f"intermediate_size {intermediate_size} is not divisible by tp "
             f"{smash_config['tensor_parallel_size']}."
         )
         shard_intermediate_size = 2 * intermediate_size // smash_config["tensor_parallel_size"]
+        
+        # (ii) Get the compute parameters
         dtype = smash_config["compute_dtype"]
         use_fp8_w8a8 = smash_config["weight_dtype"] == "fp8_w8a8"
         use_int8_w8a16 = smash_config["weight_dtype"] == "int8_w8a16"
-        FP8_DTYPE = imported_packages["vllm_platforms"].current_platform.fp8_dtype()
+
+        # (iii) Tune the kernel over a range of batch sizes
         batch_sizes = [
             1,
             2,
@@ -230,20 +208,17 @@ def tune(
         ]
 
         ray.init()
-        num_gpus = int(ray.available_resources()["GPU"])
-        workers = [BenchmarkWorker.remote(0) for _ in range(num_gpus)]
 
         is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
-        search_space = get_configs_compute_bound(is_fp16, None)
+        search_space = get_configs_compute_bound(is_fp16)
         pruna_logger.info(f"Start tuning over {len(search_space)} configurations...")
 
         start = time.time()
         outputs = []
-        worker_idx = 0
         for batch_size in batch_sizes:
-            input_args = (
-                batch_size,
-                E,
+            output = tune.remote(
+                batch_size,                 # num_tokens
+                E,                          # num_experts
                 shard_intermediate_size,
                 hidden_size,
                 topk,
@@ -251,35 +226,47 @@ def tune(
                 use_fp8_w8a8,
                 use_int8_w8a16,
                 search_space,
-                None,
-                False,
+                None,                       # we don't support block quantization for now
+                False,                      # not use_deep_gemm
+                imported_packages,
+                0,                          # random seed
+                smash_config["num_iters"],
             )
-            worker = workers[worker_idx]
-            worker_method = getattr(worker, "tune")
-            output = worker_method.remote(*input_args)
             outputs.append(output)
-            worker_idx = (worker_idx + 1) % num_gpus
+
         configs = ray.get(outputs)
 
+        # (iv) Sort the configs by batch size and save the best configs
         best_configs = {
             M: sort_config(config) for M, config in zip(batch_sizes, configs)
         }
-        self.save_configs(
+        # save configs in caches (for hf and vllm)
+        save_configs(
             best_configs,
             E,
             shard_intermediate_size,
-            hidden_size,
-            topk,
             dtype,
             use_fp8_w8a8,
             use_int8_w8a16,
             None,
-            args.save_dir,
+            smash_config["path_to_huggingface_hub_cache"],
+            smash_config["path_to_vllm_cache"],
             imported_packages,
         )
+        # attached configs to the smash config
+        smash_config["best_configs_moe_kernel"] = best_configs
+        # attached hyperparameters to the smash config for loading
+        smash_config["num_experts"] = E
+        smash_config["shard_intermediate_size"] = shard_intermediate_size
+        smash_config["dtype"] = dtype
+        smash_config["use_fp8_w8a8"] = use_fp8_w8a8
+        smash_config["use_int8_w8a16"] = use_int8_w8a16
+        # attached load function to the smash config for loading
+        smash_config.load_fns.append(LOAD_FUNCTIONS.moe_kernel_tuner.name)
         end = time.time()
         pruna_logger.info(f"Tuning took {end - start:.2f} seconds")
 
+        # (v) Return the model (untouched; only the configs are saved)
         return model
 
     def import_algorithm_packages(self) -> Dict[str, Any]:
@@ -299,6 +286,7 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
         )
         from vllm.model_executor.layers.fused_moe import override_config
         from vllm.triton_utils import triton
+        import vllm.envs as envs
 
         return dict(
             FusedMoEQuantConfig=FusedMoEQuantConfig,
@@ -306,12 +294,26 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
             FusedMoE=fused_moe,
             vllm_platforms=vllm_platforms,
             triton=triton,
-            tqdm_ray=tqdm_ray,
             override_config=override_config,
+            envs=envs,
         )
 
-    def save_configs(
-        configs: dict[int, BenchmarkConfig],
+class BenchmarkConfig(TypedDict):
+    """
+    The configuration for the matrix multiplication (tiling and warp scheduling).
+    """
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+# Converts the function into a Ray remote task and requests one GPU per task invocation
+@ray.remote(num_gpus=1)
+def tune(
+        self,
+        num_tokens: int,
         num_experts: int,
         shard_intermediate_size: int,
         hidden_size: int,
@@ -319,43 +321,100 @@ def save_configs(
         dtype: torch.dtype,
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
+        search_space: list[dict[str, int]],
         block_quant_shape: list[int],
-        save_dir: str,
+        use_deep_gemm: bool,
         imported_packages: Dict[str, Any],
-    ) -> None:
-        dtype_str = imported_packages["_get_config_dtype_str"](
-            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
-        )
-
-        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-        # is the intermediate size after silu_and_mul.
-        filename = imported_packages["fused_moe"].get_config_file_name(
-            num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
-        )
-
-        # We want to save at 3 different places:
-        # 1. The cache of vllm
-        # 2. The cache of kernels hub
-        # 3. The smashconfig (to be reused once mode is smashed and saved).
+        seed: int,
+        num_iters: int,
+    ) -> dict[str, int]:
+    """
+    Tune a given Triton kernel.
+
+    Parameters
+    ----------
+    num_tokens: int
+        The number of tokens in the batch.
+    num_experts: int
+        The number of experts.
+    shard_intermediate_size: int
+        The intermediate size of the model in the shard (if using tensor parallelism).
+    hidden_size: int
+        The hidden size of the model.
+    topk: int
+        The number of active experts per token.
+    dtype: torch.dtype
+        The dtype to use for the weights and activations.
+    use_fp8_w8a8: bool
+        Whether to use fp8_w8a8.
+    use_int8_w8a16: bool
+        Whether to use int8_w8a16.
+    search_space: list[dict[str, int]]
+        The search space for the kernel (tiling and warp scheduling).
+    block_quant_shape: list[int]
+        The block shape for the kernel (None here).
+    use_deep_gemm: bool
+        Whether to use deep gemm (False here).
+    imported_packages: Dict[str, Any]
+        The imported packages (vllm, triton, etc.).
+    seed: int
+        The random seed.
+    num_iters: int
+        The number of iterations to average the kernel time on.
+
+    Returns
+    -------
+    dict[str, int]
+        The best config.
+    """
+    imported_packages["vllm_platforms"].current_platform.seed_everything(seed)
+    best_config = None
+    best_time = float("inf")
+
+    for config in tqdm_ray(search_space):
+        try:
+            kernel_time = benchmark_config(
+                config,
+                num_tokens,
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+                topk,
+                dtype,
+                use_fp8_w8a8,
+                use_int8_w8a16,
+                num_iters=num_iters,
+                block_quant_shape=block_quant_shape,
+                use_deep_gemm=use_deep_gemm,
+                imported_packages=imported_packages,
+                num_iters=num_iters,
+            )
+        except imported_packages["triton"].runtime.autotuner.OutOfResources:
+            # Some configurations may be invalid and fail to compile.
+            continue
 
+        if kernel_time < best_time:
+            best_time, best_config = kernel_time, config
 
-        os.makedirs(save_dir, exist_ok=True)
-        filename = os.path.join(save_dir, filename)
-        pruna_logger.info(f"Writing best config to {filename}...")
-        with open(filename, "w") as f:
-            json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
-            f.write("\n")
+    now = datetime.now()
+    pruna_logger.info(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+    assert best_config is not None
+    return best_config
 
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    """
+    Sort the configuration (tiling and warp scheduling).
 
-class BenchmarkConfig(TypedDict):
-    BLOCK_SIZE_M: int
-    BLOCK_SIZE_N: int
-    BLOCK_SIZE_K: int
-    GROUP_SIZE_M: int
-    num_warps: int
-    num_stages: int
+    Parameters
+    ----------
+    config: BenchmarkConfig
+        The configuration to sort.
 
-def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    Returns
+    -------
+    BenchmarkConfig
+        The sorted configuration.
+    """
     return {
         "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
         "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -375,7 +434,20 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     }
 
 
-def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]:
+def get_configs_compute_bound(use_fp16: bool) -> list[dict[str, int]]:
+    """
+    Get the gridsearch space for the kernel (tiling and warp scheduling).
+
+    Parameters
+    ----------
+    use_fp16: bool
+        Whether to use fp16.
+
+    Returns
+    -------
+    list[dict[str, int]]
+        The search space for the kernel (tiling and warp scheduling).
+    """
     configs: list[BenchmarkConfig] = []
 
     # Reduced search space for faster tuning.
@@ -400,17 +472,6 @@ def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int
         config = dict(zip(keys, config_values))
         configs.append(config)
 
-    # Remove configs that are not compatible with fp8 block quantization
-    # BLOCK_SIZE_K must be a multiple of block_k
-    # BLOCK_SIZE_N must be a multiple of block_n
-    if block_quant_shape is not None and not use_fp16:
-        block_n, block_k = block_quant_shape[0], block_quant_shape[1]
-        for config in configs[:]:
-            if (
-                config["BLOCK_SIZE_K"] % block_k != 0
-                or config["BLOCK_SIZE_N"] % block_n != 0
-            ):
-                configs.remove(config)
     return configs
 
 
@@ -429,6 +490,46 @@ def benchmark_config(
     use_deep_gemm: bool = False,
     imported_packages: Dict[str, Any] = None,
 ) -> float:
+    """
+    Benchmark a given Triton kernel using CUDAGraph.
+
+    This function is copied from the vllm repository.
+    https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py
+
+    Parameters
+    ----------
+    config: BenchmarkConfig
+        The configuration to benchmark.
+    num_tokens: int
+        The number of tokens in the batch.
+    num_experts: int
+        The number of experts.
+    shard_intermediate_size: int
+        The intermediate size of the model in the shard (if using tensor parallelism).
+    hidden_size: int
+        The hidden size of the model.
+    topk: int
+        The number of active experts per token.
+    dtype: torch.dtype
+        The dtype to use for the weights and activations.
+    use_fp8_w8a8: bool
+        Whether to use fp8_w8a8.
+    use_int8_w8a16: bool
+        Whether to use int8_w8a16.
+    num_iters: int
+        The number of iterations to run the benchmark.
+    block_quant_shape: list[int]
+        The block shape for the kernel (None here).
+    use_deep_gemm: bool
+        Whether to use deep gemm (False here).
+    imported_packages: Dict[str, Any]
+        The imported packages (vllm, triton, etc.).
+
+    Returns
+    -------
+    float
+        The average latency of the kernel.
+    """
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     if use_int8_w8a16:
@@ -571,3 +672,80 @@ def run():
     avg = sum(latencies) / (num_iters * 10) * 1000  # us
     graph.reset()
     return avg
+
+
+def save_configs(
+    configs: dict[int, BenchmarkConfig],
+    num_experts: int,
+    shard_intermediate_size: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    block_quant_shape: list[int],
+    path_to_huggingface_hub_cache: str,
+    path_to_vllm_cache: str,
+    imported_packages: Dict[str, Any],
+) -> None:
+    """
+    Save the best configs to the hf cache and vllm cache.
+
+    Parameters
+    ----------
+    configs: dict[int, BenchmarkConfig]
+        The best configs.
+    num_experts: int
+        The number of experts.
+    shard_intermediate_size: int
+        The intermediate size of the model in the shard (if using tensor parallelism).
+    hidden_size: int
+        The hidden size of the model.
+    topk: int
+        The number of active experts per token.
+    dtype: torch.dtype
+        The dtype to use for the weights and activations.
+    use_fp8_w8a8: bool
+        Whether to use fp8_w8a8.
+    use_int8_w8a16: bool
+        Whether to use int8_w8a16.
+    block_quant_shape: list[int]
+        The block shape for the kernel (None here).
+    path_to_huggingface_hub_cache: str
+        The path to the huggingface hub cache.
+    path_to_vllm_cache: str
+        The path to the vllm cache.
+    imported_packages: Dict[str, Any]
+        The imported packages (vllm, triton, etc.).
+    """
+    dtype_str = imported_packages["_get_config_dtype_str"](
+        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+    )
+
+    # (i) Get the name of the config file
+    # NB from vllm: The current naming convention uses w2.shape[2], which
+    # is the intermediate size after silu_and_mul.
+    filename = imported_packages["fused_moe"].get_config_file_name(
+        num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
+    )
+
+    # (ii) Save the config to the hf cache (where `kernels` lib expects to find it)
+    path_to_kernel_configs = os.path.join(path_to_huggingface_hub_cache, ".cache/huggingface/hub/models--RedHatAI--moe/blobs/configs")
+    os.makedirs(path_to_kernel_configs, exist_ok=True)
+    filename = os.path.join(path_to_kernel_configs, filename)
+    if not os.path.exists(filename):
+        pruna_logger.info(f"Writing best config to {filename}...")
+        with open(filename, "w") as f:
+            json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
+            f.write("\n")
+
+    # (iii) Save the config to the vllm cache (where `vllm` expects to find it)
+    path_to_vllm_configs = imported_packages["envs"].VLLM_TUNED_CONFIG_FOLDER
+    if path_to_vllm_configs is None:
+        path_where_vllm_is_installed = find_spec("vllm").submodule_search_locations[0]
+        path_to_vllm_configs = os.path.join(os.path.dirname(path_where_vllm_is_installed), path_to_vllm_cache, "configs")
+    os.makedirs(path_to_vllm_configs, exist_ok=True)
+    filename = os.path.join(path_to_vllm_configs, filename)
+    if not os.path.exists(filename):
+        pruna_logger.info(f"Writing best config to {filename}...")
+        with open(filename, "w") as f:
+            json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
+            f.write("\n")

From 19a5518b491924fbe7df666e099eb491a400e6fb Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Thu, 18 Dec 2025 16:27:45 +0000
Subject: [PATCH 05/17] feat: add custom loading fn

---
 src/pruna/engine/load.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 7380cf71..0e965409 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -544,6 +544,39 @@ def load_hqq_diffusers(path: str | Path, smash_config: SmashConfig, **kwargs) ->
     return model
 
 
+def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs) -> Any:
+    """
+    Load a tuned kernel config inside the hf/vllm cache, then load the model.
+
+    Parameters
+    ----------
+    path: str | Path
+        The path to the model directory.
+    smash_config: SmashConfig
+        The SmashConfig object containing the best configs for the MoE kernel tuner.
+    **kwargs: Any
+        Additional keyword arguments to pass to the model loading function.
+
+    Returns
+    -------
+    Any
+        The loaded model.
+    """
+    from pruna.algorithms.moe_kernel_tuner import MoEKernelTuner, save_configs
+    imported_packages = MoEKernelTuner().import_algorithm_packages()
+    save_configs(smash_config["best_configs_moe_kernel"],
+    smash_config["num_experts"],
+    smash_config["shard_intermediate_size"],
+    smash_config["dtype"],
+    smash_config["use_fp8_w8a8"],
+    smash_config["use_int8_w8a16"],
+    smash_config["block_quant_shape"],
+    smash_config["path_to_huggingface_hub_cache"],
+    smash_config["path_to_vllm_cache"],
+    imported_packages)
+    return load_transformers_model(path, smash_config, **kwargs)
+
+
 class LOAD_FUNCTIONS(Enum):  # noqa: N801
     """
     Enumeration of load functions for different model types.
@@ -578,6 +611,7 @@ class LOAD_FUNCTIONS(Enum):  # noqa: N801
     hqq = partial(load_hqq)
     hqq_diffusers = partial(load_hqq_diffusers)
     torch_artifacts = partial(load_torch_artifacts)
+    moe_kernel_tuner = partial(load_moe_kernel_tuner)
 
     def __call__(self, *args, **kwargs) -> Any:
         """

From eb98702049c29a8d42c87dec27f7ca0ed0e0d074 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Thu, 18 Dec 2025 16:28:25 +0000
Subject: [PATCH 06/17] feat: add custom loading fn

---
 src/pruna/engine/load.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 0e965409..3109b32d 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -563,17 +563,20 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
         The loaded model.
     """
     from pruna.algorithms.moe_kernel_tuner import MoEKernelTuner, save_configs
+
     imported_packages = MoEKernelTuner().import_algorithm_packages()
-    save_configs(smash_config["best_configs_moe_kernel"],
-    smash_config["num_experts"],
-    smash_config["shard_intermediate_size"],
-    smash_config["dtype"],
-    smash_config["use_fp8_w8a8"],
-    smash_config["use_int8_w8a16"],
-    smash_config["block_quant_shape"],
-    smash_config["path_to_huggingface_hub_cache"],
-    smash_config["path_to_vllm_cache"],
-    imported_packages)
+    save_configs(
+        smash_config["best_configs_moe_kernel"],
+        smash_config["num_experts"],
+        smash_config["shard_intermediate_size"],
+        smash_config["dtype"],
+        smash_config["use_fp8_w8a8"],
+        smash_config["use_int8_w8a16"],
+        smash_config["block_quant_shape"],
+        smash_config["path_to_huggingface_hub_cache"],
+        smash_config["path_to_vllm_cache"],
+        imported_packages,
+    )
     return load_transformers_model(path, smash_config, **kwargs)
 
 

From b2d600e21b0072fe598582d8504fd050cbe84497 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Fri, 19 Dec 2025 16:58:11 +0000
Subject: [PATCH 07/17] feat: add unit test

---
 src/pruna/algorithms/moe_kernel_tuner.py     |  9 +++++----
 src/pruna/engine/load.py                     |  2 +-
 tests/algorithms/testers/moe_kernel_tuner.py | 13 +++++++++++++
 tests/fixtures.py                            |  1 +
 4 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 tests/algorithms/testers/moe_kernel_tuner.py

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 6daaac57..3c421201 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -40,7 +40,7 @@ class MoeKernelTuner(PrunaAlgorithmBase):
     """
     Tune the MoE Triton kernel for the model.
 
-    Uses vLLM to tune the MoE kernel of the model.
+    Uses vLLM to tune the MoE kernel.
     """
 
     algorithm_name: str = "moe_kernel_tuner"
@@ -172,8 +172,8 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             raise ValueError(f"Model {model.__class__.__name__} has no config.")
         E = model_config.num_experts                # number of experts
         topk = model_config.num_experts_per_tok if is_moe_lm(model) else model_config.moe_topk[0] # number of active experts per token
-        intermediate_size = model_config.intermediate_size # 3072 # FFN intermediate size
-        hidden_size = model_config.hidden_size #4096        # model hidden dim
+        intermediate_size = model_config.intermediate_size # FFN intermediate size
+        hidden_size = model_config.hidden_size # model hidden dim
         assert intermediate_size % smash_config["tensor_parallel_size"] == 0, (
             f"intermediate_size {intermediate_size} is not divisible by tp "
             f"{smash_config['tensor_parallel_size']}."
@@ -207,6 +207,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             4096,
         ]
 
+        # use ray to parallelize the tuning
         ray.init()
 
         is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
@@ -261,7 +262,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         smash_config["dtype"] = dtype
         smash_config["use_fp8_w8a8"] = use_fp8_w8a8
         smash_config["use_int8_w8a16"] = use_int8_w8a16
-        # attached load function to the smash config for loading
+        # attach load function to the smash config for loading
         smash_config.load_fns.append(LOAD_FUNCTIONS.moe_kernel_tuner.name)
         end = time.time()
         pruna_logger.info(f"Tuning took {end - start:.2f} seconds")
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 3109b32d..e04e8e03 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -546,7 +546,7 @@ def load_hqq_diffusers(path: str | Path, smash_config: SmashConfig, **kwargs) ->
 
 def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs) -> Any:
     """
-    Load a tuned kernel config inside the hf/vllm cache, then load the model.
+    Load a tuned kernel config inside the hf/vllm caches, then load the model.
 
     Parameters
     ----------
diff --git a/tests/algorithms/testers/moe_kernel_tuner.py b/tests/algorithms/testers/moe_kernel_tuner.py
new file mode 100644
index 00000000..5c4da5c8
--- /dev/null
+++ b/tests/algorithms/testers/moe_kernel_tuner.py
@@ -0,0 +1,13 @@
+from pruna.algorithms.moe_kernel_tuner import MoeKernelTuner
+
+from .base_tester import AlgorithmTesterBase
+
+
+class TestMoeKernelTuner(AlgorithmTesterBase):
+    """Test the MoeKernelTuner."""
+
+    models = ["qwen3_coder_tiny"]
+    reject_models = ["sd_tiny_random"]
+    allow_pickle_files = False
+    algorithm_class = MoeKernelTuner
+    metrics = ["perplexity"]
diff --git a/tests/fixtures.py b/tests/fixtures.py
index d5bd55d7..e959e517 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -197,4 +197,5 @@ def get_autoregressive_text_to_image_model(model_id: str) -> tuple[Any, SmashCon
     "wan_tiny_random": partial(get_diffusers_model, "pruna-test/wan-t2v-tiny-random", torch_dtype=torch.bfloat16),
     "flux_tiny": partial(get_diffusers_model, "pruna-test/tiny_flux", torch_dtype=torch.float16),
     "tiny_llama": partial(get_automodel_transformers, "pruna-test/tiny_llama", torch_dtype=torch.bfloat16),
+    "qwen3_coder_tiny": partial(get_automodel_transformers, "pruna-test/qwen3_coder_tiny"),
 }

From eeda9c8496244f30fba1f23b819fe6622314aeb2 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Mon, 22 Dec 2025 18:09:13 +0000
Subject: [PATCH 08/17] feat: add vllm dep and upd torch version

---
 pyproject.toml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 44df84bd..a82074d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,8 +96,8 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
-    "torch==2.7.0",
-    "torchvision==0.22.0",
+    "torch==2.8.0",
+    "torchvision==0.23.0",
     "torchmetrics[image]==1.7.4",
     "requests>=2.31.0",
     "transformers",
@@ -136,7 +136,8 @@ dependencies = [
     "aenum",
     "vbench-pruna; sys_platform != 'darwin'",
     "imageio-ffmpeg",
-    "jaxtyping"
+    "jaxtyping",
+    "vllm>=0.11.0",
 ]
 
 [project.optional-dependencies]

From 4de173e363c371f537729ef4238882787748747b Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Mon, 22 Dec 2025 18:10:51 +0000
Subject: [PATCH 09/17] feat: change smashconfig to save artifacts and reload
 it

---
 src/pruna/algorithms/moe_kernel_tuner.py | 149 +++++++++++++----------
 src/pruna/config/smash_config.py         |  46 ++++++-
 src/pruna/engine/load.py                 |  34 ++++--
 3 files changed, 160 insertions(+), 69 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 3c421201..e03f2a20 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -91,8 +91,8 @@ def get_hyperparameters(self) -> list:
             ),
             CategoricalHyperparameter(
                 "weight_dtype",
-                choices=["fp8_w8a8", "int8_w8a16"],
-                default_value="fp8_w8a8",
+                choices=["fp16", "fp8_w8a8", "int8_w8a16"],
+                default_value="fp16",
                 meta=dict(desc="Dtype to use for the weights (and activations)."),
             ),
             OrdinalHyperparameter(
@@ -123,6 +123,24 @@ def get_hyperparameters(self) -> list:
                 default_value=20,
                 meta=dict(desc="Number of iterations to average the kernel times on."),
             ),
+            OrdinalHyperparameter(
+                "block_size_m_max",
+                sequence=[4, 5, 6, 7, 8, 9, 10],
+                default_value=8,
+                meta=dict(desc="Maximum (log) block size for tiling through input dimension."),
+            ),
+            OrdinalHyperparameter(
+                "block_size_n_max",
+                sequence=[5, 6, 7, 8, 9, 10],
+                default_value=8,
+                meta=dict(desc="Maximum (log) block size for tiling through output dimension."),
+            ),
+            OrdinalHyperparameter(
+                "block_size_k_max",
+                sequence=[6, 7, 8, 9, 10],
+                default_value=8,
+                meta=dict(desc="Maximum (log) block size for tiling through intermediate dimension."),
+            ),
         ]
 
     def model_check_fn(self, model: Any) -> bool:
@@ -182,36 +200,33 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         
         # (ii) Get the compute parameters
         dtype = smash_config["compute_dtype"]
+        if dtype == "bfloat16":
+            dtype = torch.bfloat16
+        else: # default to float16
+            dtype = torch.float16
         use_fp8_w8a8 = smash_config["weight_dtype"] == "fp8_w8a8"
         use_int8_w8a16 = smash_config["weight_dtype"] == "int8_w8a16"
 
         # (iii) Tune the kernel over a range of batch sizes
         batch_sizes = [
             1,
-            2,
-            4,
-            8,
-            16,
-            24,
-            32,
-            48,
-            64,
-            96,
-            128,
-            256,
-            512,
-            1024,
-            1536,
-            2048,
-            3072,
-            4096,
-        ]
+            2,]
+        #    4,
+        #    8,
+        #    16,
+        #    24,
+        #    32,
+        #    48,
+        #    64,
+        #    96,
+        #    128,
+        # ]
 
         # use ray to parallelize the tuning
         ray.init()
 
         is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
-        search_space = get_configs_compute_bound(is_fp16)
+        search_space = get_configs_compute_bound(is_fp16, smash_config)
         pruna_logger.info(f"Start tuning over {len(search_space)} configurations...")
 
         start = time.time()
@@ -254,14 +269,18 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             smash_config["path_to_vllm_cache"],
             imported_packages,
         )
-        # attached configs to the smash config
-        smash_config["best_configs_moe_kernel"] = best_configs
-        # attached hyperparameters to the smash config for loading
-        smash_config["num_experts"] = E
-        smash_config["shard_intermediate_size"] = shard_intermediate_size
-        smash_config["dtype"] = dtype
-        smash_config["use_fp8_w8a8"] = use_fp8_w8a8
-        smash_config["use_int8_w8a16"] = use_int8_w8a16
+        # stash results in the SmashConfig for later loading (cannot add new hyperparams to ConfigSpace here)
+        payload = dict(
+            best_configs_moe_kernel=best_configs,
+            num_experts=E,
+            shard_intermediate_size=shard_intermediate_size,
+            dtype=dtype,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            block_quant_shape=None,
+        )
+        # store artifacts in SmashConfig so they persist across save/load
+        smash_config.artifacts["moe_kernel_tuner"] = payload
         # attach load function to the smash config for loading
         smash_config.load_fns.append(LOAD_FUNCTIONS.moe_kernel_tuner.name)
         end = time.time()
@@ -313,7 +332,6 @@ class BenchmarkConfig(TypedDict):
 # Converts the function into a Ray actor and requests one GPU per actor instance
 @ray.remote(num_gpus=1)
 def tune(
-        self,
         num_tokens: int,
         num_experts: int,
         shard_intermediate_size: int,
@@ -372,7 +390,7 @@ def tune(
     best_config = None
     best_time = float("inf")
 
-    for config in tqdm_ray(search_space):
+    for config in tqdm_ray.tqdm(search_space):
         try:
             kernel_time = benchmark_config(
                 config,
@@ -388,7 +406,6 @@ def tune(
                 block_quant_shape=block_quant_shape,
                 use_deep_gemm=use_deep_gemm,
                 imported_packages=imported_packages,
-                num_iters=num_iters,
             )
         except imported_packages["triton"].runtime.autotuner.OutOfResources:
             # Some configurations may be invalid and fail to compile.
@@ -435,7 +452,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     }
 
 
-def get_configs_compute_bound(use_fp16: bool) -> list[dict[str, int]]:
+def get_configs_compute_bound(use_fp16: bool, smash_config: SmashConfigPrefixWrapper) -> list[dict[str, int]]:
     """
     Get the gridsearch space for the kernel (tiling and warp scheduling).
 
@@ -443,7 +460,8 @@ def get_configs_compute_bound(use_fp16: bool) -> list[dict[str, int]]:
     ----------
     use_fp16: bool
         Whether to use fp16.
-
+    smash_config: SmashConfigPrefixWrapper
+        The Smash configuration.
     Returns
     -------
     list[dict[str, int]]
@@ -452,9 +470,9 @@ def get_configs_compute_bound(use_fp16: bool) -> list[dict[str, int]]:
     configs: list[BenchmarkConfig] = []
 
     # Reduced search space for faster tuning.
-    block_m_range = [16, 32, 64, 128, 256]
-    block_n_range = [32, 64, 128, 256]
-    block_k_range = [64, 128, 256]
+    block_m_range = [2**i for i in range(4, smash_config["block_size_m_max"]+1)]
+    block_n_range = [2**i for i in range(5, smash_config["block_size_n_max"]+1)]
+    block_k_range = [2**i for i in range(6, smash_config["block_size_k_max"]+1)]
     num_warps_range = [4, 8]
     group_m_range = [1, 16, 32, 64]
     num_stage_range = [2, 3, 4, 5]
@@ -531,8 +549,14 @@ def benchmark_config(
     float
         The average latency of the kernel.
     """
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for MoeKernelTuner.")
+    # Ray sets CUDA_VISIBLE_DEVICES per worker to the GPU it scheduled
+    torch.cuda.set_device(0)
+    device = torch.device("cuda")
+
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
     if use_int8_w8a16:
         w1 = torch.randint(
             -127,
@@ -543,6 +567,7 @@ def benchmark_config(
                 hidden_size,
             ),
             dtype=torch.int8,
+            device=device,
         )
         w2 = torch.randint(
             -127,
@@ -553,15 +578,16 @@ def benchmark_config(
                 shard_intermediate_size // 2,
             ),
             dtype=torch.int8,
+            device=device,
         )
     else:
         w1 = torch.randn(
-            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype, device=device
         )
         w2 = torch.randn(
-            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype, device=device
         )
-    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
+    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32, device=device)
 
     w1_scale = None
     w2_scale = None
@@ -569,9 +595,9 @@ def benchmark_config(
     a2_scale = None
     if use_int8_w8a16:
         w1_scale = torch.randn(
-            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
+            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32, device=device
         )
-        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32, device=device)
     if use_deep_gemm:
         # we use the default block shape for deepgemm
         block_quant_shape = [128, 128]
@@ -587,24 +613,24 @@ def benchmark_config(
             k_tiles_w1 = (K + block_k - 1) // block_k
             k_tiles_w2 = (N + block_k - 1) // block_k
             w1_scale = (
-                torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+                torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32, device=device)
                 * factor_for_scale
             )
             w2_scale = (
-                torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+                torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32, device=device)
                 * factor_for_scale
             )
         else:
-            w1_scale = torch.randn(num_experts, dtype=torch.float32)
-            w2_scale = torch.randn(num_experts, dtype=torch.float32)
+            w1_scale = torch.randn(num_experts, dtype=torch.float32, device=device)
+            w2_scale = torch.randn(num_experts, dtype=torch.float32, device=device)
 
-        a1_scale = torch.randn(1, dtype=torch.float32)
-        a2_scale = torch.randn(1, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32, device=device)
+        a2_scale = torch.randn(1, dtype=torch.float32, device=device)
 
-        w1 = w1.to(imported_packages["vllm_platforms"].current_platform.fp8_dtype())
-        w2 = w2.to(imported_packages["vllm_platforms"].current_platform.fp8_dtype())
+        w1 = w1.to(device=device, dtype=imported_packages["vllm_platforms"].current_platform.fp8_dtype())
+        w2 = w2.to(device=device, dtype=imported_packages["vllm_platforms"].current_platform.fp8_dtype())
 
-    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32, device=device)
 
     def prepare(i: int):
         input_gating.copy_(gating_output[i])
@@ -662,6 +688,7 @@ def run():
 
     latencies: list[float] = []
     for i in range(num_iters):
+        print(f"Iteration {i} of {num_iters}")
         prepare(i)
         torch.cuda.synchronize()
 
@@ -724,17 +751,17 @@ def save_configs(
     # (i) Get the name of the config file
     # NB from vllm: The current naming convention uses w2.shape[2], which
     # is the intermediate size after silu_and_mul.
-    filename = imported_packages["fused_moe"].get_config_file_name(
+    filename = imported_packages["FusedMoE"].get_config_file_name(
         num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
     )
 
     # (ii) Save the config to the hf cache (where `kernels` lib expects to find it)
     path_to_kernel_configs = os.path.join(path_to_huggingface_hub_cache, ".cache/huggingface/hub/models--RedHatAI--moe/blobs/configs")
     os.makedirs(path_to_kernel_configs, exist_ok=True)
-    filename = os.path.join(path_to_kernel_configs, filename)
-    if not os.path.exists(filename):
-        pruna_logger.info(f"Writing best config to {filename}...")
-        with open(filename, "w") as f:
+    filename_hf = os.path.join(path_to_kernel_configs, filename)
+    if not os.path.exists(filename_hf):
+        pruna_logger.info(f"Writing best config to {filename_hf}...")
+        with open(filename_hf, "w") as f:
             json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
             f.write("\n")
 
@@ -744,9 +771,9 @@ def save_configs(
         path_where_vllm_is_installed = find_spec("vllm").submodule_search_locations[0]
         path_to_vllm_configs = os.path.join(os.path.dirname(path_where_vllm_is_installed), path_to_vllm_cache, "configs")
     os.makedirs(path_to_vllm_configs, exist_ok=True)
-    filename = os.path.join(path_to_vllm_configs, filename)
-    if not os.path.exists(filename):
-        pruna_logger.info(f"Writing best config to {filename}...")
-        with open(filename, "w") as f:
+    filename_vllm = os.path.join(path_to_vllm_configs, filename)
+    if not os.path.exists(filename_vllm):
+        pruna_logger.info(f"Writing best config to {filename_vllm}...")
+        with open(filename_vllm, "w") as f:
             json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
             f.write("\n")
diff --git a/src/pruna/config/smash_config.py b/src/pruna/config/smash_config.py
index 9d1ae6b0..3a679577 100644
--- a/src/pruna/config/smash_config.py
+++ b/src/pruna/config/smash_config.py
@@ -39,6 +39,7 @@
     "device",
     "device_map",
     "cache_dir",
+    "artifacts",
     "save_fns",
     "load_fns",
     "reapply_after_load",
@@ -101,6 +102,9 @@ def __init__(
         # internal variable to indicated that a model has been smashed for a specific batch size
         self.__locked_batch_size = False
 
+        # generic container for algorithm-produced artifacts that should be saved with the config
+        self.artifacts: dict[str, Any] = {}
+
         # ensure the cache directory is deleted on program exit
         atexit.register(self.cleanup_cache_dir)
 
@@ -313,7 +317,10 @@ def save_to_json(self, path: str | Path) -> None:
             config_dict[key] = convert_numpy_types(value)
 
         for name in ADDITIONAL_ARGS:
-            config_dict[name] = getattr(self, name)
+            value = getattr(self, name)
+            if name == "artifacts":
+                value = convert_artifacts_for_json(value)
+            config_dict[name] = value
 
         # do not save the old cache directory or device
         if "cache_dir" in config_dict:
@@ -747,3 +754,40 @@ def convert_numpy_types(input_value: Any) -> Any:
         return input_value.item()
     else:
         return input_value
+
+
+def convert_artifacts_for_json(value: Any) -> Any:
+    """
+    Convert artifacts to JSON-serializable forms.
+
+    - torch.dtype -> its string name (e.g., 'float16', 'bfloat16')
+    - Path -> str
+    - sets/tuples -> lists
+    - recursively handle dicts/lists
+
+    Parameters
+    ----------
+    value : Any
+        The value to convert.
+
+    Returns
+    -------
+    Any
+        The converted value.
+    """
+    if isinstance(value, dict):
+        return {k: convert_artifacts_for_json(v) for k, v in value.items()}
+    if isinstance(value, list):
+        return [convert_artifacts_for_json(v) for v in value]
+    if isinstance(value, tuple) or isinstance(value, set):
+        return [convert_artifacts_for_json(v) for v in value]
+    if isinstance(value, torch.dtype):
+        # map to canonical string
+        if value == getattr(torch, "float16", None):
+            return "float16"
+        if value == getattr(torch, "bfloat16", None):
+            return "bfloat16"
+        return str(value)
+    if isinstance(value, Path):
+        return str(value)
+    return value
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index e04e8e03..b893cbfc 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -565,18 +565,38 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
     from pruna.algorithms.moe_kernel_tuner import MoEKernelTuner, save_configs
 
     imported_packages = MoEKernelTuner().import_algorithm_packages()
+    payload = getattr(smash_config, "artifacts", {}).get("moe_kernel_tuner")
+    if not payload:
+        pruna_logger.error(
+            "MoE kernel tuner artifacts not found in SmashConfig. "
+            "Ensure the tuner ran successfully before saving/loading."
+        )
+    best_configs = payload["best_configs_moe_kernel"]
+    num_experts = payload["num_experts"]
+    shard_intermediate_size = payload["shard_intermediate_size"]
+    dtype = payload["dtype"]
+    # Convert dtype string back to torch.dtype if needed
+    if dtype == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float16
+    use_fp8_w8a8 = payload["use_fp8_w8a8"]
+    use_int8_w8a16 = payload["use_int8_w8a16"]
+    block_quant_shape = payload["block_quant_shape"]
+
     save_configs(
-        smash_config["best_configs_moe_kernel"],
-        smash_config["num_experts"],
-        smash_config["shard_intermediate_size"],
-        smash_config["dtype"],
-        smash_config["use_fp8_w8a8"],
-        smash_config["use_int8_w8a16"],
-        smash_config["block_quant_shape"],
+        best_configs,
+        num_experts,
+        shard_intermediate_size,
+        dtype,
+        use_fp8_w8a8,
+        use_int8_w8a16,
+        block_quant_shape,
         smash_config["path_to_huggingface_hub_cache"],
         smash_config["path_to_vllm_cache"],
         imported_packages,
     )
+    smash_config.load_fns.remove(LOAD_FUNCTIONS.moe_kernel_tuner.name)
     return load_transformers_model(path, smash_config, **kwargs)
 
 

From 1314089dcac5e887fcd7a6c9c9a5e58fac694f4b Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 10:34:11 +0000
Subject: [PATCH 10/17] fix: adapt parameter names inside smashconfig

---
 src/pruna/engine/load.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index b893cbfc..5573dc8e 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -562,9 +562,9 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
     Any
         The loaded model.
     """
-    from pruna.algorithms.moe_kernel_tuner import MoEKernelTuner, save_configs
+    from pruna.algorithms.moe_kernel_tuner import MoeKernelTuner, save_configs
 
-    imported_packages = MoEKernelTuner().import_algorithm_packages()
+    imported_packages = MoeKernelTuner().import_algorithm_packages()
     payload = getattr(smash_config, "artifacts", {}).get("moe_kernel_tuner")
     if not payload:
         pruna_logger.error(
@@ -584,6 +584,7 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
     use_int8_w8a16 = payload["use_int8_w8a16"]
     block_quant_shape = payload["block_quant_shape"]
 
+    # save the config attached to smash_config, inside the hf and vllm caches.
     save_configs(
         best_configs,
         num_experts,
@@ -592,8 +593,8 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
         use_fp8_w8a8,
         use_int8_w8a16,
         block_quant_shape,
-        smash_config["path_to_huggingface_hub_cache"],
-        smash_config["path_to_vllm_cache"],
+        smash_config["moe_kernel_tuner_path_to_huggingface_hub_cache"],
+        smash_config["moe_kernel_tuner_path_to_vllm_cache"],
         imported_packages,
     )
     smash_config.load_fns.remove(LOAD_FUNCTIONS.moe_kernel_tuner.name)

From 3eb111ebc62727e94fbdcb4286d51ebc20a25e4b Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 10:35:57 +0000
Subject: [PATCH 11/17] fix: moe intermediate size can differ from model
 intermediate size

---
 src/pruna/algorithms/moe_kernel_tuner.py | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index e03f2a20..27ed73cf 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -190,7 +190,12 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             raise ValueError(f"Model {model.__class__.__name__} has no config.")
         E = model_config.num_experts                # number of experts
         topk = model_config.num_experts_per_tok if is_moe_lm(model) else model_config.moe_topk[0] # number of active experts per token
-        intermediate_size = model_config.intermediate_size # FFN intermediate size
+        # qwen_moe can use different intermediate size compared to mixtral.
+        intermediate_size = (
+            model_config.moe_intermediate_size
+            if model_config.moe_intermediate_size is not None
+            else model_config.intermediate_size
+        )
         hidden_size = model_config.hidden_size # model hidden dim
         assert intermediate_size % smash_config["tensor_parallel_size"] == 0, (
             f"intermediate_size {intermediate_size} is not divisible by tp "
@@ -208,19 +213,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         use_int8_w8a16 = smash_config["weight_dtype"] == "int8_w8a16"
 
         # (iii) Tune the kernel over a range of batch sizes
-        batch_sizes = [
-            1,
-            2,]
-        #    4,
-        #    8,
-        #    16,
-        #    24,
-        #    32,
-        #    48,
-        #    64,
-        #    96,
-        #    128,
-        # ]
+        batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
 
         # use ray to parallelize the tuning
         ray.init()
@@ -769,7 +762,7 @@ def save_configs(
     path_to_vllm_configs = imported_packages["envs"].VLLM_TUNED_CONFIG_FOLDER
     if path_to_vllm_configs is None:
         path_where_vllm_is_installed = find_spec("vllm").submodule_search_locations[0]
-        path_to_vllm_configs = os.path.join(os.path.dirname(path_where_vllm_is_installed), path_to_vllm_cache, "configs")
+        path_to_vllm_configs = os.path.join(os.path.dirname(path_where_vllm_is_installed), path_to_vllm_cache)
     os.makedirs(path_to_vllm_configs, exist_ok=True)
     filename_vllm = os.path.join(path_to_vllm_configs, filename)
     if not os.path.exists(filename_vllm):

From 57642740f137542d11aff9746f5b08ad83cf0c30 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 16:57:01 +0000
Subject: [PATCH 12/17] fix: adapt xformers version to fit new torch version

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a82074d8..a23171ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -156,8 +156,8 @@ gptq = [
     "gptqmodel; sys_platform == 'darwin' and platform_machine == 'arm64'",
 ]
 full = [
-    "xformers==0.0.30",
-    "stable-fast-pruna==1.0.7",
+    "xformers>=0.0.30",
+    "stable-fast-pruna>=1.0.7",
 ]
 dev = [
     "wget",

From 85f015e3774c4ad47d4507e9ee572ee11a434053 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 17:14:41 +0000
Subject: [PATCH 13/17] feat: uv tries to resolve even for extra dependencies
 in the ci

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a23171ca..8066ebf4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -142,8 +142,8 @@ dependencies = [
 
 [project.optional-dependencies]
 stable-fast = [
-    "xformers==0.0.30",
-    "stable-fast-pruna==1.0.7",
+    "xformers>=0.0.30",
+    "stable-fast-pruna>=1.0.7",
 ]
 # dependencies are added here because the wheels aren't bundling them
 gptq = [

From 9322957b43b05cc4f6f8a6a58cf4b774b4c1ad9b Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 17:43:34 +0000
Subject: [PATCH 14/17] feat: ruff linting

---
 src/pruna/algorithms/moe_kernel_tuner.py | 95 +++++++++++++-----------
 src/pruna/config/smash_config.py         |  2 +-
 src/pruna/engine/load.py                 |  5 +-
 3 files changed, 53 insertions(+), 49 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 27ed73cf..ea710591 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -13,19 +13,19 @@
 # limitations under the License.
 from __future__ import annotations
 
+import json
+import pathlib
+import time
 from collections.abc import Iterable
+from datetime import datetime
+from importlib.util import find_spec
+from itertools import product
 from typing import Any, Dict, TypedDict
 
-import ray.experimental.tqdm_ray as tqdm_ray
 import ray
-import time
+import ray.experimental.tqdm_ray as tqdm_ray
 import torch
 from ConfigSpace import CategoricalHyperparameter, OrdinalHyperparameter
-from datetime import datetime
-import os
-import json
-from itertools import product
-from importlib.util import find_spec
 
 from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
 from pruna.algorithms.base.tags import AlgorithmTag as tags
@@ -188,27 +188,29 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         model_config = model.config
         if model_config is None:
             raise ValueError(f"Model {model.__class__.__name__} has no config.")
-        E = model_config.num_experts                # number of experts
-        topk = model_config.num_experts_per_tok if is_moe_lm(model) else model_config.moe_topk[0] # number of active experts per token
+        nb_experts = model_config.num_experts                # number of experts
+        # number of active experts per token
+        topk = (
+            model_config.num_experts_per_tok
+            if is_moe_lm(model)
+            else model_config.moe_topk[0]
+        )
         # qwen_moe can use different intermediate size compared to mixtral.
         intermediate_size = (
             model_config.moe_intermediate_size
             if model_config.moe_intermediate_size is not None
             else model_config.intermediate_size
         )
-        hidden_size = model_config.hidden_size # model hidden dim
+        hidden_size = model_config.hidden_size  # model hidden dim
         assert intermediate_size % smash_config["tensor_parallel_size"] == 0, (
             f"intermediate_size {intermediate_size} is not divisible by tp "
             f"{smash_config['tensor_parallel_size']}."
         )
         shard_intermediate_size = 2 * intermediate_size // smash_config["tensor_parallel_size"]
-        
+
         # (ii) Get the compute parameters
         dtype = smash_config["compute_dtype"]
-        if dtype == "bfloat16":
-            dtype = torch.bfloat16
-        else: # default to float16
-            dtype = torch.float16
+        dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float16
         use_fp8_w8a8 = smash_config["weight_dtype"] == "fp8_w8a8"
         use_int8_w8a16 = smash_config["weight_dtype"] == "int8_w8a16"
 
@@ -227,7 +229,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         for batch_size in batch_sizes:
             output = tune.remote(
                 batch_size,                 # num_tokens
-                E,                          # num_experts
+                nb_experts,                          # num_experts per block
                 shard_intermediate_size,
                 hidden_size,
                 topk,
@@ -252,7 +254,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         # save configs in caches (for hf and vllm)
         save_configs(
             best_configs,
-            E,
+            nb_experts,
             shard_intermediate_size,
             dtype,
             use_fp8_w8a8,
@@ -265,7 +267,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         # stash results in the SmashConfig for later loading (cannot add new hyperparams to ConfigSpace here)
         payload = dict(
             best_configs_moe_kernel=best_configs,
-            num_experts=E,
+            num_experts=nb_experts,
             shard_intermediate_size=shard_intermediate_size,
             dtype=dtype,
             use_fp8_w8a8=use_fp8_w8a8,
@@ -291,15 +293,15 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
         Dict[str, Any]
             The algorithm packages.
         """
+        import vllm.envs as envs
         import vllm.model_executor.layers.fused_moe.fused_moe as fused_moe
         import vllm.platforms as vllm_platforms
+        from vllm.model_executor.layers.fused_moe import override_config
         from vllm.model_executor.layers.fused_moe.config import (
             FusedMoEQuantConfig,
             _get_config_dtype_str,
         )
-        from vllm.model_executor.layers.fused_moe import override_config
         from vllm.triton_utils import triton
-        import vllm.envs as envs
 
         return dict(
             FusedMoEQuantConfig=FusedMoEQuantConfig,
@@ -311,10 +313,10 @@ def import_algorithm_packages(self) -> Dict[str, Any]:
             envs=envs,
         )
 
+
 class BenchmarkConfig(TypedDict):
-    """
-    The configuration for the matrix multiplication (tiling and warp scheduling).
-    """
+    """The configuration for the matrix multiplication (tiling and warp scheduling)."""
+
     BLOCK_SIZE_M: int
     BLOCK_SIZE_N: int
     BLOCK_SIZE_K: int
@@ -322,6 +324,7 @@ class BenchmarkConfig(TypedDict):
     num_warps: int
     num_stages: int
 
+
 # Converts the function into a Ray actor and requests one GPU per actor instance
 @ray.remote(num_gpus=1)
 def tune(
@@ -412,6 +415,7 @@ def tune(
     assert best_config is not None
     return best_config
 
+
 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
     """
     Sort the configuration (tiling and warp scheduling).
@@ -455,6 +459,7 @@ def get_configs_compute_bound(use_fp16: bool, smash_config: SmashConfigPrefixWra
         Whether to use fp16.
     smash_config: SmashConfigPrefixWrapper
         The Smash configuration.
+
     Returns
     -------
     list[dict[str, int]]
@@ -463,9 +468,9 @@ def get_configs_compute_bound(use_fp16: bool, smash_config: SmashConfigPrefixWra
     configs: list[BenchmarkConfig] = []
 
     # Reduced search space for faster tuning.
-    block_m_range = [2**i for i in range(4, smash_config["block_size_m_max"]+1)]
-    block_n_range = [2**i for i in range(5, smash_config["block_size_n_max"]+1)]
-    block_k_range = [2**i for i in range(6, smash_config["block_size_k_max"]+1)]
+    block_m_range = [2**i for i in range(4, smash_config["block_size_m_max"] + 1)]
+    block_n_range = [2**i for i in range(5, smash_config["block_size_n_max"] + 1)]
+    block_k_range = [2**i for i in range(6, smash_config["block_size_k_max"] + 1)]
     num_warps_range = [4, 8]
     group_m_range = [1, 16, 32, 64]
     num_stage_range = [2, 3, 4, 5]
@@ -597,20 +602,20 @@ def benchmark_config(
     if use_fp8_w8a8:
         if block_quant_shape:
             block_n, block_k = block_quant_shape[0], block_quant_shape[1]
-            E = num_experts
-            N = shard_intermediate_size // 2
-            K = hidden_size
+            e = num_experts
+            n = shard_intermediate_size // 2
+            k = hidden_size
             factor_for_scale = 1e-2
-            n_tiles_w1 = (2 * N + block_n - 1) // block_n
-            n_tiles_w2 = (K + block_n - 1) // block_n
-            k_tiles_w1 = (K + block_k - 1) // block_k
-            k_tiles_w2 = (N + block_k - 1) // block_k
+            n_tiles_w1 = (2 * n + block_n - 1) // block_n
+            n_tiles_w2 = (k + block_n - 1) // block_n
+            k_tiles_w1 = (k + block_k - 1) // block_k
+            k_tiles_w2 = (n + block_k - 1) // block_k
             w1_scale = (
-                torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32, device=device)
+                torch.rand((e, n_tiles_w1, k_tiles_w1), dtype=torch.float32, device=device)
                 * factor_for_scale
             )
             w2_scale = (
-                torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32, device=device)
+                torch.rand((e, n_tiles_w2, k_tiles_w2), dtype=torch.float32, device=device)
                 * factor_for_scale
             )
         else:
@@ -681,7 +686,6 @@ def run():
 
     latencies: list[float] = []
     for i in range(num_iters):
-        print(f"Iteration {i} of {num_iters}")
         prepare(i)
         torch.cuda.synchronize()
 
@@ -749,10 +753,13 @@ def save_configs(
     )
 
     # (ii) Save the config to the hf cache (where `kernels` lib expects to find it)
-    path_to_kernel_configs = os.path.join(path_to_huggingface_hub_cache, ".cache/huggingface/hub/models--RedHatAI--moe/blobs/configs")
-    os.makedirs(path_to_kernel_configs, exist_ok=True)
-    filename_hf = os.path.join(path_to_kernel_configs, filename)
-    if not os.path.exists(filename_hf):
+    path_to_kernel_configs = (
+        pathlib.Path(path_to_huggingface_hub_cache) /
+        ".cache/huggingface/hub/models--RedHatAI--moe/blobs/configs"
+    )
+    pathlib.Path(path_to_kernel_configs).mkdir(exist_ok=True, parents=True)
+    filename_hf = path_to_kernel_configs / filename
+    if not pathlib.Path(filename_hf).exists():
         pruna_logger.info(f"Writing best config to {filename_hf}...")
         with open(filename_hf, "w") as f:
             json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
@@ -762,10 +769,10 @@ def save_configs(
     path_to_vllm_configs = imported_packages["envs"].VLLM_TUNED_CONFIG_FOLDER
     if path_to_vllm_configs is None:
         path_where_vllm_is_installed = find_spec("vllm").submodule_search_locations[0]
-        path_to_vllm_configs = os.path.join(os.path.dirname(path_where_vllm_is_installed), path_to_vllm_cache)
-    os.makedirs(path_to_vllm_configs, exist_ok=True)
-    filename_vllm = os.path.join(path_to_vllm_configs, filename)
-    if not os.path.exists(filename_vllm):
+        path_to_vllm_configs = pathlib.Path(path_where_vllm_is_installed).parent / path_to_vllm_cache
+    pathlib.Path(path_to_vllm_configs).mkdir(exist_ok=True, parents=True)
+    filename_vllm = path_to_vllm_configs / filename
+    if not pathlib.Path(filename_vllm).exists():
         pruna_logger.info(f"Writing best config to {filename_vllm}...")
         with open(filename_vllm, "w") as f:
             json.dump({"triton_version": imported_packages["triton"].__version__, **configs}, f, indent=4)
diff --git a/src/pruna/config/smash_config.py b/src/pruna/config/smash_config.py
index 3a679577..b57b6149 100644
--- a/src/pruna/config/smash_config.py
+++ b/src/pruna/config/smash_config.py
@@ -779,7 +779,7 @@ def convert_artifacts_for_json(value: Any) -> Any:
         return {k: convert_artifacts_for_json(v) for k, v in value.items()}
     if isinstance(value, list):
         return [convert_artifacts_for_json(v) for v in value]
-    if isinstance(value, tuple) or isinstance(value, set):
+    if isinstance(value, (tuple, set)):
         return [convert_artifacts_for_json(v) for v in value]
     if isinstance(value, torch.dtype):
         # map to canonical string
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 5573dc8e..6c528551 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -576,10 +576,7 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
     shard_intermediate_size = payload["shard_intermediate_size"]
     dtype = payload["dtype"]
     # Convert dtype string back to torch.dtype if needed
-    if dtype == "bfloat16":
-        dtype = torch.bfloat16
-    else:
-        dtype = torch.float16
+    dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float16
     use_fp8_w8a8 = payload["use_fp8_w8a8"]
     use_int8_w8a16 = payload["use_int8_w8a16"]
     block_quant_shape = payload["block_quant_shape"]

From f7daf144c7ada4ee974bd8c443ff01b4e2de986e Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Tue, 23 Dec 2025 18:07:33 +0000
Subject: [PATCH 15/17] feat: ty check linting

---
 src/pruna/algorithms/moe_kernel_tuner.py | 13 +++---
 src/pruna/engine/load.py                 | 50 ++++++++++++------------
 2 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index ea710591..49562d71 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -272,7 +272,6 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             dtype=dtype,
             use_fp8_w8a8=use_fp8_w8a8,
             use_int8_w8a16=use_int8_w8a16,
-            block_quant_shape=None,
         )
         # store artifacts in SmashConfig so they persist across save/load
         smash_config.artifacts["moe_kernel_tuner"] = payload
@@ -438,14 +437,14 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
         "num_warps": config["num_warps"],
         "num_stages": config["num_stages"],
         **(
-            {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {}
+            {"waves_per_eu": config.get("waves_per_eu")} if "waves_per_eu" in config else {}
         ),
         **(
-            {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]}
+            {"matrix_instr_nonkdim": config.get("matrix_instr_nonkdim")}
             if "matrix_instr_nonkdim" in config
             else {}
         ),
-        **({"kpack": config["kpack"]} if "kpack" in config else {}),
+        **({"kpack": config.get("kpack")} if "kpack" in config else {}),
     }
 
 
@@ -768,7 +767,11 @@ def save_configs(
     # (iii) Save the config to the vllm cache (where `vllm` expects to find it)
     path_to_vllm_configs = imported_packages["envs"].VLLM_TUNED_CONFIG_FOLDER
     if path_to_vllm_configs is None:
-        path_where_vllm_is_installed = find_spec("vllm").submodule_search_locations[0]
+        submodule_locations = find_spec("vllm").submodule_search_locations
+        if submodule_locations is not None and len(submodule_locations) > 0:
+            path_where_vllm_is_installed = submodule_locations[0]
+        else:
+            raise RuntimeError("Could not determine installation path for vllm.")
         path_to_vllm_configs = pathlib.Path(path_where_vllm_is_installed).parent / path_to_vllm_cache
     pathlib.Path(path_to_vllm_configs).mkdir(exist_ok=True, parents=True)
     filename_vllm = path_to_vllm_configs / filename
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 6c528551..993a100b 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -571,31 +571,31 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
             "MoE kernel tuner artifacts not found in SmashConfig. "
             "Ensure the tuner ran successfully before saving/loading."
         )
-    best_configs = payload["best_configs_moe_kernel"]
-    num_experts = payload["num_experts"]
-    shard_intermediate_size = payload["shard_intermediate_size"]
-    dtype = payload["dtype"]
-    # Convert dtype string back to torch.dtype if needed
-    dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float16
-    use_fp8_w8a8 = payload["use_fp8_w8a8"]
-    use_int8_w8a16 = payload["use_int8_w8a16"]
-    block_quant_shape = payload["block_quant_shape"]
-
-    # save the config attached to smash_config, inside the hf and vllm caches.
-    save_configs(
-        best_configs,
-        num_experts,
-        shard_intermediate_size,
-        dtype,
-        use_fp8_w8a8,
-        use_int8_w8a16,
-        block_quant_shape,
-        smash_config["moe_kernel_tuner_path_to_huggingface_hub_cache"],
-        smash_config["moe_kernel_tuner_path_to_vllm_cache"],
-        imported_packages,
-    )
-    smash_config.load_fns.remove(LOAD_FUNCTIONS.moe_kernel_tuner.name)
-    return load_transformers_model(path, smash_config, **kwargs)
+    else:
+        best_configs = payload["best_configs_moe_kernel"]
+        num_experts = payload["num_experts"]
+        shard_intermediate_size = payload["shard_intermediate_size"]
+        dtype = payload["dtype"]
+        # Convert dtype string back to torch.dtype if needed
+        dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float16
+        use_fp8_w8a8 = payload["use_fp8_w8a8"]
+        use_int8_w8a16 = payload["use_int8_w8a16"]
+
+        # save the config attached to smash_config, inside the hf and vllm caches.
+        save_configs(
+            best_configs,
+            num_experts,
+            shard_intermediate_size,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            None,
+            smash_config["moe_kernel_tuner_path_to_huggingface_hub_cache"],
+            smash_config["moe_kernel_tuner_path_to_vllm_cache"],
+            imported_packages,
+        )
+        smash_config.load_fns.remove(LOAD_FUNCTIONS.moe_kernel_tuner.name)
+        return load_transformers_model(path, smash_config, **kwargs)
 
 
 class LOAD_FUNCTIONS(Enum):  # noqa: N801

From 1d6cc7d30948b4066975b271f12182b4a683d1fe Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Wed, 24 Dec 2025 09:16:31 +0000
Subject: [PATCH 16/17] fix: numpydoc spacing issue

---
 src/pruna/algorithms/moe_kernel_tuner.py | 84 +++++++++++-------------
 src/pruna/engine/load.py                 |  8 +--
 2 files changed, 44 insertions(+), 48 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index 49562d71..daf477be 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -347,33 +347,33 @@ def tune(
 
     Parameters
     ----------
-    num_tokens: int
+    num_tokens : int
         The number of tokens in the batch.
-    num_experts: int
+    num_experts : int
         The number of experts.
-    shard_intermediate_size: int
+    shard_intermediate_size : int
         The intermediate size of the model in the shard (if using tensor parallelism).
-    hidden_size: int
+    hidden_size : int
         The hidden size of the model.
-    topk: int
+    topk : int
         The number of active experts per token.
-    dtype: torch.dtype
+    dtype : torch.dtype
         The dtype to use for the weights and activations.
-    use_fp8_w8a8: bool
+    use_fp8_w8a8 : bool
         Whether to use fp8_w8a8.
-    use_int8_w8a16: bool
+    use_int8_w8a16 : bool
         Whether to use int8_w8a16.
-    search_space: list[dict[str, int]]
+    search_space : list[dict[str, int]]
         The search space for the kernel (tiling and warp scheduling).
-    block_quant_shape: list[int]
+    block_quant_shape : list[int]
         The block shape for the kernel (None here).
-    use_deep_gemm: bool
+    use_deep_gemm : bool
         Whether to use deep gemm (False here).
-    imported_packages: Dict[str, Any]
+    imported_packages : Dict[str, Any]
         The imported packages (vllm, triton, etc.).
-    seed: int
+    seed : int
         The random seed.
-    num_iters: int
+    num_iters : int
         The number of iterations to average the kernel time on.
 
     Returns
@@ -421,7 +421,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
 
     Parameters
     ----------
-    config: BenchmarkConfig
+    config : BenchmarkConfig
         The configuration to sort.
 
     Returns
@@ -454,9 +454,9 @@ def get_configs_compute_bound(use_fp16: bool, smash_config: SmashConfigPrefixWra
 
     Parameters
     ----------
-    use_fp16: bool
+    use_fp16 : bool
         Whether to use fp16.
-    smash_config: SmashConfigPrefixWrapper
+    smash_config : SmashConfigPrefixWrapper
         The Smash configuration.
 
     Returns
@@ -514,31 +514,31 @@ def benchmark_config(
 
     Parameters
     ----------
-    config: BenchmarkConfig
+    config : BenchmarkConfig
         The configuration to benchmark.
-    num_tokens: int
+    num_tokens : int
         The number of tokens in the batch.
-    num_experts: int
+    num_experts : int
         The number of experts.
-    shard_intermediate_size: int
+    shard_intermediate_size : int
         The intermediate size of the model in the shard (if using tensor parallelism).
-    hidden_size: int
+    hidden_size : int
         The hidden size of the model.
-    topk: int
+    topk : int
         The number of active experts per token.
-    dtype: torch.dtype
+    dtype : torch.dtype
         The dtype to use for the weights and activations.
-    use_fp8_w8a8: bool
+    use_fp8_w8a8 : bool
         Whether to use fp8_w8a8.
-    use_int8_w8a16: bool
+    use_int8_w8a16 : bool
         Whether to use int8_w8a16.
-    num_iters: int
+    num_iters : int
         The number of iterations to run the benchmark.
-    block_quant_shape: list[int]
+    block_quant_shape : list[int]
         The block shape for the kernel (None here).
-    use_deep_gemm: bool
+    use_deep_gemm : bool
         Whether to use deep gemm (False here).
-    imported_packages: Dict[str, Any]
+    imported_packages : Dict[str, Any]
         The imported packages (vllm, triton, etc.).
 
     Returns
@@ -715,29 +715,25 @@ def save_configs(
 
     Parameters
     ----------
-    configs: dict[int, BenchmarkConfig]
+    configs : dict[int, BenchmarkConfig]
         The best configs.
-    num_experts: int
+    num_experts : int
         The number of experts.
-    shard_intermediate_size: int
+    shard_intermediate_size : int
         The intermediate size of the model in the shard (if using tensor parallelism).
-    hidden_size: int
-        The hidden size of the model.
-    topk: int
-        The number of active experts per token.
-    dtype: torch.dtype
+    dtype : torch.dtype
         The dtype to use for the weights and activations.
-    use_fp8_w8a8: bool
+    use_fp8_w8a8 : bool
         Whether to use fp8_w8a8.
-    use_int8_w8a16: bool
+    use_int8_w8a16 : bool
         Whether to use int8_w8a16.
-    block_quant_shape: list[int]
+    block_quant_shape : list[int]
         The block shape for the kernel (None here).
-    path_to_huggingface_hub_cache: str
+    path_to_huggingface_hub_cache : str
         The path to the huggingface hub cache.
-    path_to_vllm_cache: str
+    path_to_vllm_cache : str
         The path to the vllm cache.
-    imported_packages: Dict[str, Any]
+    imported_packages : Dict[str, Any]
         The imported packages (vllm, triton, etc.).
     """
     dtype_str = imported_packages["_get_config_dtype_str"](
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 993a100b..4f13433f 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -550,17 +550,17 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
 
     Parameters
     ----------
-    path: str | Path
+    path : str | Path
         The path to the model directory.
-    smash_config: SmashConfig
+    smash_config : SmashConfig
         The SmashConfig object containing the best configs for the MoE kernel tuner.
-    **kwargs: Any
+    **kwargs : Any
         Additional keyword arguments to pass to the model loading function.
 
     Returns
     -------
     Any
-        The loaded model.
+        The loaded MoE model.
     """
     from pruna.algorithms.moe_kernel_tuner import MoeKernelTuner, save_configs
 

From a2a3511e4ef3e665741c343f0986b0b96814e315 Mon Sep 17 00:00:00 2001
From: llcnt <louis.leconte@ens-paris-saclay.fr>
Date: Wed, 7 Jan 2026 14:49:05 +0000
Subject: [PATCH 17/17] fix: minor bugs from review

---
 src/pruna/algorithms/moe_kernel_tuner.py | 13 ++++++++++---
 src/pruna/config/smash_config.py         |  1 +
 src/pruna/engine/load.py                 |  4 +++-
 src/pruna/engine/model_checks.py         |  2 +-
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/pruna/algorithms/moe_kernel_tuner.py b/src/pruna/algorithms/moe_kernel_tuner.py
index daf477be..e9b5734d 100644
--- a/src/pruna/algorithms/moe_kernel_tuner.py
+++ b/src/pruna/algorithms/moe_kernel_tuner.py
@@ -197,8 +197,8 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         )
         # qwen_moe can use different intermediate size compared to mixtral.
         intermediate_size = (
-            model_config.moe_intermediate_size
-            if model_config.moe_intermediate_size is not None
+            getattr(model_config, "moe_intermediate_size", None)
+            if getattr(model_config, "moe_intermediate_size", None) is not None
             else model_config.intermediate_size
         )
         hidden_size = model_config.hidden_size  # model hidden dim
@@ -246,6 +246,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             outputs.append(output)
 
         configs = ray.get(outputs)
+        ray.shutdown()
 
         # (iv) Sort the configs by batch size and save the best configs
         best_configs = {
@@ -411,7 +412,13 @@ def tune(
 
     now = datetime.now()
     pruna_logger.info(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
-    assert best_config is not None
+    if best_config is None:
+        raise RuntimeError(
+            f"No valid kernel configuration was found for batch_size={num_tokens}. "
+            "All configurations failed (e.g., due to OutOfResources). "
+            "This can happen on GPUs with limited resources. "
+            "Consider reducing your model size, batch size, or tuning search space."
+        )
     return best_config
 
 
diff --git a/src/pruna/config/smash_config.py b/src/pruna/config/smash_config.py
index b57b6149..43c659d6 100644
--- a/src/pruna/config/smash_config.py
+++ b/src/pruna/config/smash_config.py
@@ -354,6 +354,7 @@ def flush_configuration(self) -> None:
         self.save_fns = []
         self.load_fns = []
         self.reapply_after_load = {}
+        self.artifacts = {}
 
         # reset potentially previously used cache directory
         self.reset_cache_dir()
diff --git a/src/pruna/engine/load.py b/src/pruna/engine/load.py
index 4f13433f..78500529 100644
--- a/src/pruna/engine/load.py
+++ b/src/pruna/engine/load.py
@@ -567,10 +567,12 @@ def load_moe_kernel_tuner(path: str | Path, smash_config: SmashConfig, **kwargs)
     imported_packages = MoeKernelTuner().import_algorithm_packages()
     payload = getattr(smash_config, "artifacts", {}).get("moe_kernel_tuner")
     if not payload:
-        pruna_logger.error(
+        error_msg = (
             "MoE kernel tuner artifacts not found in SmashConfig. "
             "Ensure the tuner ran successfully before saving/loading."
         )
+        pruna_logger.error(error_msg)
+        raise RuntimeError(error_msg)
     else:
         best_configs = payload["best_configs_moe_kernel"]
         num_experts = payload["num_experts"]
diff --git a/src/pruna/engine/model_checks.py b/src/pruna/engine/model_checks.py
index 6a4bbb75..757925b9 100644
--- a/src/pruna/engine/model_checks.py
+++ b/src/pruna/engine/model_checks.py
@@ -121,7 +121,7 @@ def is_moe_lm(model: Any) -> bool:
     bool
         True if the model is a MoE LM, False otherwise.
     """
-    return hasattr(model, "num_experts")
+    return hasattr(getattr(model, "config", None), "num_experts")
 
 
 def is_transformers_pipeline_with_causal_lm(model: Any) -> bool: