From 14dfdc561f4d963be9ea6af137a1838d57e57c55 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Tue, 30 Dec 2025 13:04:27 -0600 Subject: [PATCH 1/9] Add cache of preloaded models and use max_dynamic_batch size; models for multiple batch sizes are created and then loaded into memory on initialization --- .../migraphx/migraphx_execution_provider.cc | 367 +++++++++++++----- .../migraphx/migraphx_execution_provider.h | 9 + 2 files changed, 288 insertions(+), 88 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index fb93c72320d45..b1e158cae70d4 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -154,7 +154,8 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv metadef_id_generator_{ModelMetadefIdGenerator::Create()}, external_alloc_{info.external_alloc}, external_free_{info.external_free}, - external_empty_cache_{info.external_empty_cache} { + external_empty_cache_{info.external_empty_cache}, + max_dynamic_batch_{info.max_dynamic_batch} { InitProviderOrtApi(); // Set GPU device to be used and read device properties for feature usage. @@ -180,6 +181,13 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv GET_ENV_BOOL(migraphx_env_vars::kDumpModelOps, dump_model_ops_); GET_ENV_BOOL(migraphx_env_vars::kExhaustiveTune, exhaustive_tune_); + // Get max dynamic batch size from environment variable + const auto max_dynamic_batch_env{GetEnvironmentVar(migraphx_env_vars::kModelMaxDynamicBatch)}; + if (!max_dynamic_batch_env.empty()) { + max_dynamic_batch_ = std::stoull(max_dynamic_batch_env); + LOGS_DEFAULT(INFO) << "\n " << migraphx_env_vars::kModelMaxDynamicBatch << ": " << max_dynamic_batch_; + } + // Verify configuration correctness and adjust accordingly.
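// [Editorial note] Usage sketch for the override above, assuming the variable is exported
// before the execution provider is constructed (later patches in this series name it
// ORT_MIGRAPHX_MAX_DYNAMIC_BATCH):
//   setenv("ORT_MIGRAPHX_MAX_DYNAMIC_BATCH", "8", 1);  // hypothetical value
//   // -> max_dynamic_batch_ becomes 8, so pre-compilation covers batches 1, 2, 4, 8.
// Caveat: std::stoull throws std::invalid_argument on a non-numeric value; the patch does
// not guard against that here.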
#if HIP_VERSION_MAJOR < 6 || (HIP_VERSION_MAJOR == 6 && (HIP_VERSION_MINOR < 4 || (HIP_VERSION_MINOR == 4 && HIP_VERSION_PATCH < 2))) @@ -237,7 +245,8 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv << "\n " << migraphx_provider_option::kInt8CalibTable << ": " << int8_calibration_table_name_ << "\n int8_calibration_cache_available: " << int8_calibration_cache_available_ << "\n " << migraphx_provider_option::kInt8UseNativeCalibTable << ": " << int8_use_native_calibration_table_ - << "\n " << migraphx_provider_option::kModelCacheDir << ": " << model_cache_path_; + << "\n " << migraphx_provider_option::kModelCacheDir << ": " << model_cache_path_ + << "\n " << migraphx_provider_option::kModelMaxDynamicBatch << ": " << max_dynamic_batch_; } std::vector<AllocatorPtr> MIGraphXExecutionProvider::CreatePreferredAllocators() { @@ -1315,6 +1324,82 @@ std::string make_hash(const char* v) { constexpr std::uint64_t MIGraphX_Version = ((MIGRAPHX_VERSION_MAJOR << 16) | (MIGRAPHX_VERSION_MINOR << 8) | MIGRAPHX_VERSION_PATCH); +// Helper function to get power-of-2 batch sizes up to max +std::vector<size_t> GetPowerOf2BatchSizes(size_t max_batch) { + std::vector<size_t> batch_sizes; + if (max_batch == 0) return batch_sizes; + + for (size_t batch = 1; batch <= max_batch; batch *= 2) { + batch_sizes.push_back(batch); + } + + // If max_batch is not a power of 2, add it + if (batch_sizes.empty() || batch_sizes.back() != max_batch) { + batch_sizes.push_back(max_batch); + } + + return batch_sizes; +} + +// Helper: Compile a single program with a specific batch size +migraphx::program CompileProgramWithBatch( + const std::string& onnx_string, + const std::vector<std::string>& input_names, + const std::vector<int64_t>& base_input_shape, + size_t batch_size, + migraphx::onnx_options& options, + const migraphx::target& t, + bool fp16_enable, + bool bf16_enable, + bool int8_enable, + bool fp8_enable, + bool int8_calibration_cache_available, + std::unordered_map<std::string, float>& dynamic_range_map, + bool exhaustive_tune, + const std::filesystem::path& model_path) { + + LOGS_DEFAULT(VERBOSE) << "[PreCompile] Compiling for batch size: " << batch_size; + + // Set input shapes with the specified batch size + std::vector<std::size_t> shape_with_batch; + shape_with_batch.push_back(batch_size); + for (size_t i = 1; i < base_input_shape.size(); ++i) { + shape_with_batch.push_back(static_cast<std::size_t>(base_input_shape[i])); + } + + // Assume single input for now (can be extended) + if (!input_names.empty()) { + options.set_input_parameter_shape(input_names[0], shape_with_batch); + + std::ostringstream ss; + ss << "["; + for (size_t i = 0; i < shape_with_batch.size(); ++i) { + if (i > 0) ss << ", "; + ss << shape_with_batch[i]; + } + ss << "]"; + LOGS_DEFAULT(VERBOSE) << "[PreCompile] Input '" << input_names[0] << "' shape: " << ss.str(); + } + +#ifndef ENABLE_TRAINING_CORE +#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH + if (!model_path.empty()) { + options.set_external_data_path(model_path.parent_path().string()); + } +#endif +#endif + + migraphx::program prog = migraphx::parse_onnx_buffer(onnx_string, options); + migraphx::program_parameters quant_params; + + calibrate_and_quantize(prog, t, quant_params, fp16_enable, bf16_enable, int8_enable, + fp8_enable, int8_calibration_cache_available, dynamic_range_map); + compile_program(prog, t, exhaustive_tune); + + LOGS_DEFAULT(VERBOSE) << "[PreCompile] Compilation complete for batch size: " << batch_size; + return prog; +} + Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes, std::vector<NodeComputeInfo>&
node_compute_funcs) { migraphx::onnx_options options; @@ -1509,6 +1594,69 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& LOGS_DEFAULT(VERBOSE) << "[Compile] Saving compiled model to cache: " << model_cache_file.string(); save_compiled_model(prog, model_cache_file); LOGS_DEFAULT(VERBOSE) << "[Compile] Model saved successfully with batch-aware filename"; + + // Pre-compile for power-of-2 batch sizes if max_dynamic_batch is set and this is first compilation + if (max_dynamic_batch_ > 1 && !precompile_done_ && !input_names.empty()) { + LOGS_DEFAULT(INFO) << "[PreCompile] Starting pre-compilation for batch sizes up to " << max_dynamic_batch_; + + auto batch_sizes = GetPowerOf2BatchSizes(max_dynamic_batch_); + LOGS_DEFAULT(INFO) << "[PreCompile] Will compile for " << batch_sizes.size() << " batch sizes"; + + // Get base input shape (without batch dimension) + std::vector base_shape; + if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) { + auto tensor_shape = input_tensor[0]->Shape(); + for (int j = 1; j < tensor_shape->dim_size(); ++j) { + if (tensor_shape->dim(j).has_dim_value()) { + base_shape.push_back(tensor_shape->dim(j).dim_value()); + } + } + } + + // Pre-compile for each batch size + for (size_t batch : batch_sizes) { + try { + LOGS_DEFAULT(INFO) << "[PreCompile] Compiling for batch size: " << batch; + + // Build shape with this batch size + std::vector batch_input_shapes; + batch_input_shapes.push_back(batch); + batch_input_shapes.insert(batch_input_shapes.end(), base_shape.begin(), base_shape.end()); + + // Generate cache file name + auto batch_cache_hash = make_hash(batch_input_shapes); + auto batch_cache_file = model_cache_path_ / (mxr_filename_prefix + batch_cache_hash + ".mxr"); + + migraphx::program batch_prog; + if (!load_precompiled_model(batch_prog, batch_cache_file)) { + // Compile if not in cache + migraphx::onnx_options batch_options = options; + batch_prog = CompileProgramWithBatch( + onnx_string_buffer, input_names, batch_input_shapes, batch, + batch_options, t_, fp16_enable_, bf16_enable_, int8_enable_, fp8_enable_, + int8_calibration_cache_available_, dynamic_range_map_, exhaustive_tune_, model_path_); + + // Save to disk + save_compiled_model(batch_prog, batch_cache_file); + LOGS_DEFAULT(INFO) << "[PreCompile] Saved batch " << batch << " model to: " << batch_cache_file.string(); + } else { + LOGS_DEFAULT(INFO) << "[PreCompile] Loaded pre-existing model for batch " << batch; + } + + // Store in batch cache + { + std::lock_guard lock(batch_cache_mutex_); + batch_program_cache_[fused_node.Name()][batch] = std::move(batch_prog); + } + + } catch (const std::exception& e) { + LOGS_DEFAULT(WARNING) << "[PreCompile] Failed to compile batch " << batch << ": " << e.what(); + } + } + + precompile_done_ = true; + LOGS_DEFAULT(INFO) << "[PreCompile] Pre-compilation complete"; + } } else { LOGS_DEFAULT(VERBOSE) << "[Compile] Cache hit! 
Loaded precompiled model from: " << model_cache_file.string(); } @@ -1541,6 +1689,10 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& map_onnx_string_[fused_node.Name()] = onnx_string_buffer; map_input_index_[fused_node.Name()] = input_name_index; map_no_input_shape_[fused_node.Name()] = no_input_shape; + + // Initialize batch program cache for this node + batch_program_cache_[fused_node.Name()] = std::map(); + NodeComputeInfo compute_info; compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { std::unique_ptr p = std::make_unique(); @@ -1548,7 +1700,8 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& map_onnx_string_[context->node_name], options, t_, map_input_index_[context->node_name], &mgx_mu_, map_no_input_shape_[context->node_name], fp16_enable_, bf16_enable_, fp8_enable_, int8_enable_, int8_calibration_cache_available_, dynamic_range_map_, - model_cache_path_.string(), dump_model_ops_}; + model_cache_path_.string(), dump_model_ops_, exhaustive_tune_, max_dynamic_batch_, + &batch_program_cache_[context->node_name], &batch_cache_mutex_, std::string(context->node_name)}; *state = p.release(); return 0; }; @@ -1685,105 +1838,134 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& // input shapes are different, needs to re-parse onnx and // re-compile the program if (!input_shape_match) { - LOGS_DEFAULT(VERBOSE) << "[Compute] Input shape mismatch detected, initiating recompilation"; - - std::filesystem::path model_cache_file; - // empty cache path means the MXR caching is disabled - always compile - if (!model_cache_path_.empty()) { - // Ensure input_shapes has all updated dimensions including new batch sizes - if (input_shapes.empty()) { - LOGS_DEFAULT(WARNING) << "[Compute] Input shapes vector is empty, rebuilding from current inputs"; - for (auto&& name : param_shapes.names()) { - if (map_input_name_index.count(name) > 0) { - auto input_tensor = ctx.GetInput(map_input_name_index[name]); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shape = tensor_info.GetShape(); - input_shapes.insert(input_shapes.end(), tensor_shape.begin(), tensor_shape.end()); - } - } - } + LOGS_DEFAULT(VERBOSE) << "[Compute] Input shape mismatch detected"; - // Log the shapes being used for cache key generation - std::ostringstream shapes_str; - shapes_str << "["; - for (size_t i = 0; i < input_shapes.size(); ++i) { - if (i > 0) shapes_str << ", "; - shapes_str << input_shapes[i]; + // Extract batch size from first input + size_t requested_batch = 0; + if (!map_input_name_index.empty()) { + auto first_input = ctx.GetInput(map_input_name_index.begin()->second); + auto tensor_info = first_input.GetTensorTypeAndShapeInfo(); + const auto tensor_shape = tensor_info.GetShape(); + if (!tensor_shape.empty()) { + requested_batch = static_cast(tensor_shape[0]); } - shapes_str << "]"; - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache key input shapes (including updated batch): " << shapes_str.str(); + } + + LOGS_DEFAULT(VERBOSE) << "[Compute] Requested batch size: " << requested_batch; - auto cache_hash = make_hash(input_shapes); - model_cache_file = mgx_state->model_cache_dir / (mxr_filename_prefix + cache_hash + ".mxr"); - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache file with batch-aware hash: " << model_cache_file.string(); + // Try to find program in batch cache first + bool found_in_cache = false; + { + std::lock_guard lock(*mgx_state->batch_cache_mutex_ptr); + auto& batch_cache = *mgx_state->batch_program_cache_ptr; + + 
if (batch_cache.find(requested_batch) != batch_cache.end()) { + LOGS_DEFAULT(VERBOSE) << "[Compute] Found program in batch cache for batch size: " << requested_batch; + prog = batch_cache[requested_batch]; + found_in_cache = true; + } } - if (!load_precompiled_model(prog, model_cache_file)) { - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache miss. Compiling model with updated batch size"; + if (!found_in_cache) { + LOGS_DEFAULT(VERBOSE) << "[Compute] Batch size " << requested_batch << " not in cache, need to compile"; + + std::filesystem::path model_cache_file; + // empty cache path means the MXR caching is disabled - always compile + if (!model_cache_path_.empty()) { + // Ensure input_shapes has all updated dimensions including new batch sizes + if (input_shapes.empty()) { + LOGS_DEFAULT(WARNING) << "[Compute] Input shapes vector is empty, rebuilding from current inputs"; + for (auto&& name : param_shapes.names()) { + if (map_input_name_index.count(name) > 0) { + auto input_tensor = ctx.GetInput(map_input_name_index[name]); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shape = tensor_info.GetShape(); + input_shapes.insert(input_shapes.end(), tensor_shape.begin(), tensor_shape.end()); + } + } + } - // CRITICAL: Ensure ALL input parameter shapes are explicitly set as static shapes in cmp_options - // This must be done BEFORE parsing to treat dynamic shapes as static for compilation - // NOTE: Only set shapes for actual runtime input parameters, NOT for constants/initializers - // MIGraphX will automatically infer shapes for constants and intermediate tensors - LOGS_DEFAULT(VERBOSE) << "[Compute] Setting " << map_input_name_index.size() - << " input parameter shapes as static in MIGraphX options (excluding constants)"; + // Log the shapes being used for cache key generation + std::ostringstream shapes_str; + shapes_str << "["; + for (size_t i = 0; i < input_shapes.size(); ++i) { + if (i > 0) shapes_str << ", "; + shapes_str << input_shapes[i]; + } + shapes_str << "]"; + LOGS_DEFAULT(VERBOSE) << "[Compute] Cache key input shapes (including updated batch): " << shapes_str.str(); - for (auto& it : map_input_name_index) { - auto& name = it.first; - auto& index = it.second; - auto input_tensor = ctx.GetInput(index); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shape = tensor_info.GetShape(); - std::vector ort_lens(tensor_shape.begin(), tensor_shape.end()); - - // Set shape as static parameter for MIGraphX compilation - // Only for actual input parameters - constants/initializers are handled by MIGraphX - cmp_options.set_input_parameter_shape(name, ort_lens); - - LOGS_DEFAULT(VERBOSE) << "[Compute] Set static shape for input parameter '" << name << "': [" - << [&]() { - std::ostringstream ss; - for (size_t i = 0; i < ort_lens.size(); ++i) { - if (i > 0) ss << ", "; - ss << ort_lens[i]; - } - return ss.str(); - }() << "]"; + auto cache_hash = make_hash(input_shapes); + model_cache_file = mgx_state->model_cache_dir / (mxr_filename_prefix + cache_hash + ".mxr"); + LOGS_DEFAULT(VERBOSE) << "[Compute] Looking for MXR file: " << model_cache_file.string(); } - LOGS_DEFAULT(VERBOSE) << "[Compute] All input parameter shapes set as static"; - LOGS_DEFAULT(VERBOSE) << "[Compute] MIGraphX will infer shapes for constants and intermediate tensors"; + + if (!load_precompiled_model(prog, model_cache_file)) { + LOGS_DEFAULT(VERBOSE) << "[Compute] Cache miss. 
Compiling model with updated batch size"; + + // CRITICAL: Ensure ALL input parameter shapes are explicitly set as static shapes in cmp_options + // This must be done BEFORE parsing to treat dynamic shapes as static for compilation + // NOTE: Only set shapes for actual runtime input parameters, NOT for constants/initializers + // MIGraphX will automatically infer shapes for constants and intermediate tensors + LOGS_DEFAULT(VERBOSE) << "[Compute] Setting " << map_input_name_index.size() + << " input parameter shapes as static in MIGraphX options (excluding constants)"; + + for (auto& it : map_input_name_index) { + auto& name = it.first; + auto& index = it.second; + auto input_tensor = ctx.GetInput(index); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shape = tensor_info.GetShape(); + std::vector ort_lens(tensor_shape.begin(), tensor_shape.end()); + + // Set shape as static parameter for MIGraphX compilation + // Only for actual input parameters - constants/initializers are handled by MIGraphX + cmp_options.set_input_parameter_shape(name, ort_lens); + + LOGS_DEFAULT(VERBOSE) << "[Compute] Set static shape for input parameter '" << name << "': [" + << [&]() { + std::ostringstream ss; + for (size_t i = 0; i < ort_lens.size(); ++i) { + if (i > 0) ss << ", "; + ss << ort_lens[i]; + } + return ss.str(); + }() << "]"; + } + LOGS_DEFAULT(VERBOSE) << "[Compute] All input parameter shapes set as static"; + LOGS_DEFAULT(VERBOSE) << "[Compute] MIGraphX will infer shapes for constants and intermediate tensors"; #ifndef ENABLE_TRAINING_CORE #ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH - cmp_options.set_external_data_path(model_path_.parent_path().string()); + cmp_options.set_external_data_path(model_path_.parent_path().string()); #endif #endif - LOGS_DEFAULT(VERBOSE) << "[Compute] Parsing ONNX buffer with static input shapes"; - prog = migraphx::parse_onnx_buffer(onnx_string, cmp_options); - LOGS_DEFAULT(VERBOSE) << "[Compute] ONNX parsing complete"; - - // Verify that MIGraphX parsed with correct shapes for input parameters - auto parsed_param_shapes = prog.get_parameter_shapes(); - LOGS_DEFAULT(VERBOSE) << "[Compute] Verifying parsed parameter shapes (" - << parsed_param_shapes.size() << " total parameters):"; - for (auto&& param_name : parsed_param_shapes.names()) { - auto shape = parsed_param_shapes[param_name]; - auto lens = shape.lengths(); - std::ostringstream ss; - ss << "["; - for (size_t i = 0; i < lens.size(); ++i) { - if (i > 0) ss << ", "; - ss << lens[i]; - } - ss << "]"; + LOGS_DEFAULT(VERBOSE) << "[Compute] Parsing ONNX buffer with static input shapes"; + prog = migraphx::parse_onnx_buffer(onnx_string, cmp_options); + LOGS_DEFAULT(VERBOSE) << "[Compute] ONNX parsing complete"; + + // Verify that MIGraphX parsed with correct shapes for input parameters + auto parsed_param_shapes = prog.get_parameter_shapes(); + LOGS_DEFAULT(VERBOSE) << "[Compute] Verifying parsed parameter shapes (" + << parsed_param_shapes.size() << " total parameters):"; + for (auto&& param_name : parsed_param_shapes.names()) { + auto shape = parsed_param_shapes[param_name]; + auto lens = shape.lengths(); + std::ostringstream ss; + ss << "["; + for (size_t i = 0; i < lens.size(); ++i) { + if (i > 0) ss << ", "; + ss << lens[i]; + } + ss << "]"; - // Distinguish between input parameters we set and constants MIGraphX inferred - bool is_input_param = (map_input_name_index.count(param_name) > 0); - LOGS_DEFAULT(VERBOSE) << "[Compute] Parameter '" << param_name << "' parsed 
shape: " << ss.str() - << (is_input_param ? " (input parameter)" : " (constant/internal)"); - } + // Distinguish between input parameters we set and constants MIGraphX inferred + bool is_input_param = (map_input_name_index.count(param_name) > 0); + LOGS_DEFAULT(VERBOSE) << "[Compute] Parameter '" << param_name << "' parsed shape: " << ss.str() + << (is_input_param ? " (input parameter)" : " (constant/internal)"); + } - migraphx::program_parameters quant_params; + migraphx::program_parameters quant_params; if ((int8_enable ^ fp8_enable) && int8_calibration_cache_available) { auto local_param_shapes = prog.get_parameter_shapes(); @@ -1814,9 +1996,18 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& LOGS_DEFAULT(VERBOSE) << "[Compute] Saving compiled model with updated batch size to: " << model_cache_file.string(); save_compiled_model(prog, model_cache_file); + LOGS_DEFAULT(VERBOSE) << "[Compute] Model saved to disk"; } else { - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache hit! Loaded precompiled model with matching batch size"; + LOGS_DEFAULT(VERBOSE) << "[Compute] Loaded MXR from disk: " << model_cache_file.string(); + } + + // Store in batch cache for future use + { + std::lock_guard lock(*mgx_state->batch_cache_mutex_ptr); + (*mgx_state->batch_program_cache_ptr)[requested_batch] = prog; + LOGS_DEFAULT(VERBOSE) << "[Compute] Stored program in batch cache for batch size: " << requested_batch; } + } mgx_state->prog = prog; param_shapes = prog.get_parameter_shapes(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 12758b87b2cad..5e66008c6633c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -58,6 +58,9 @@ struct MIGraphXFuncState { bool dump_model_ops = false; bool exhaustive_tune = false; size_t max_dynamic_batch; + std::map* batch_program_cache_ptr = nullptr; + std::mutex* batch_cache_mutex_ptr = nullptr; + std::string node_name; }; // Logical device representation. 
@@ -140,6 +143,12 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::unordered_map<std::string, std::unordered_map<std::string, std::size_t>> map_input_index_; std::unordered_map<std::string, bool> map_no_input_shape_; + // Cache of compiled programs indexed by batch size for each node + // Key: node_name, Value: map of batch_size -> program + std::unordered_map<std::string, std::map<size_t, migraphx::program>> batch_program_cache_; + std::mutex batch_cache_mutex_; // Protect batch_program_cache_ + bool precompile_done_ = false; // Track if we've done initial precompilation + AllocatorPtr allocator_; std::unique_ptr<ModelMetadefIdGenerator> metadef_id_generator_; void* external_alloc_{nullptr}; From 6c93dfc4a2c0613cda9cf880f8c8f205a5852c06 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Tue, 30 Dec 2025 16:19:34 -0600 Subject: [PATCH 2/9] Update to use session batch instead of previously stored batch value for lookup --- .../migraphx/migraphx_execution_provider.cc | 44 ++++++++++++++++--- .../migraphx/migraphx_execution_provider.h | 1 + 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index b1e158cae70d4..08739768fec9f 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1701,7 +1701,8 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& map_no_input_shape_[context->node_name], fp16_enable_, bf16_enable_, fp8_enable_, int8_enable_, int8_calibration_cache_available_, dynamic_range_map_, model_cache_path_.string(), dump_model_ops_, exhaustive_tune_, max_dynamic_batch_, - &batch_program_cache_[context->node_name], &batch_cache_mutex_, std::string(context->node_name)}; + &batch_program_cache_[context->node_name], &batch_cache_mutex_, std::string(context->node_name), + session_input_names}; *state = p.release(); return 0; }; @@ -1840,14 +1841,47 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& if (!input_shape_match) { LOGS_DEFAULT(VERBOSE) << "[Compute] Input shape mismatch detected"; - // Extract batch size from first input + // Extract batch size from first ACTUAL runtime input (not constants/weights) + // We need to find an input that varies with batch size size_t requested_batch = 0; - if (!map_input_name_index.empty()) { - auto first_input = ctx.GetInput(map_input_name_index.begin()->second); - auto tensor_info = first_input.GetTensorTypeAndShapeInfo(); + bool found_batch_input = false; + + // First, try to get batch from actual model inputs (session-level inputs) + for (auto& it : map_input_name_index) { + auto& name = it.first; + auto& index = it.second; + + // Skip if this looks like a weight/constant (session_input_names contains only real inputs) + if (mgx_state->session_input_names.count(name) == 0) { + continue; // This is likely a constant/weight, skip it + } + + auto input_tensor = ctx.GetInput(index); + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); const auto tensor_shape = tensor_info.GetShape(); if (!tensor_shape.empty()) { requested_batch = static_cast<size_t>(tensor_shape[0]); + found_batch_input = true; + LOGS_DEFAULT(VERBOSE) << "[Compute] Extracted batch size " << requested_batch + << " from session input '" << name << "'"; + break; + } + } + + // Fallback: if no session input found, use first available input + if (!found_batch_input && !map_input_name_index.empty()) { + for (auto& it : map_input_name_index) { + auto input_tensor = ctx.GetInput(it.second); + auto tensor_info =
input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shape = tensor_info.GetShape(); + + if (!tensor_shape.empty()) { + requested_batch = static_cast<size_t>(tensor_shape[0]); + LOGS_DEFAULT(WARNING) << "[Compute] Extracted batch size " << requested_batch + << " from input '" << it.first << "' (no session input found)"; + break; + } } } diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 5e66008c6633c..394744efd3508 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -61,6 +61,7 @@ struct MIGraphXFuncState { std::map<size_t, migraphx::program>* batch_program_cache_ptr = nullptr; std::mutex* batch_cache_mutex_ptr = nullptr; std::string node_name; + std::set<std::string> session_input_names; // Track actual model inputs vs constants }; // Logical device representation. From 978c9f455b9a7045fa3fadd71df8aa337b712853 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Thu, 1 Jan 2026 23:46:39 -0600 Subject: [PATCH 3/9] Put precompiled MXRs into the lookup table so compiled batch sizes can be looked up --- .../migraphx/migraphx_execution_provider.cc | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 08739768fec9f..f9a4f764f4a4e 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1693,6 +1693,65 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& // Initialize batch program cache for this node batch_program_cache_[fused_node.Name()] = std::map<size_t, migraphx::program>(); + // Pre-load existing cached .mxr files into batch_program_cache_ if max_dynamic_batch_ > 1 + // This eliminates repeated disk lookups during inference + if (max_dynamic_batch_ > 1 && !model_cache_path_.empty() && !no_input_shape) { + LOGS_DEFAULT(INFO) << "[Compile] Pre-loading existing cached models for fast lookup..."; + std::lock_guard lock(batch_cache_mutex_); + + // Generate batch sizes to check (powers of 2 up to max_dynamic_batch_) + std::vector<size_t> batch_sizes_to_check; + for (size_t batch = 1; batch <= max_dynamic_batch_; batch *= 2) { + batch_sizes_to_check.push_back(batch); + } + // Add max_dynamic_batch_ if it's not a power of 2 + if (batch_sizes_to_check.empty() || batch_sizes_to_check.back() != max_dynamic_batch_) { + batch_sizes_to_check.push_back(max_dynamic_batch_); + } + + // Get base shape from first input tensor (excluding batch dimension) + std::vector<int64_t> base_shape; + if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) { + auto tensor_shape = input_tensor[0]->Shape(); + for (int j = 1; j < tensor_shape->dim_size(); ++j) { + const auto& dim = tensor_shape->dim(j); + if (dim.has_dim_value()) { + base_shape.push_back(dim.dim_value()); + } else { + base_shape.push_back(1); // Default for symbolic dims + } + } + } + + int loaded_count = 0; + for (size_t batch : batch_sizes_to_check) { + // Build input shapes with this batch size + std::vector<int64_t> batch_input_shapes; + batch_input_shapes.push_back(static_cast<int64_t>(batch)); + batch_input_shapes.insert(batch_input_shapes.end(), base_shape.begin(), base_shape.end()); + + if (!batch_input_shapes.empty()) { + auto batch_cache_hash = make_hash(batch_input_shapes); + auto batch_cache_file = model_cache_path_ / (mxr_filename_prefix + batch_cache_hash + ".mxr");
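// [Editorial note] Cache-file naming used above: the hash covers the flattened shape vector
// (batch dimension first, then the remaining dims), giving
//   model_cache_path_ / (mxr_filename_prefix + hash + ".mxr")
// so each batch size maps to its own .mxr file. For a hypothetical prefix "mgx_" and hash
// "a1b2c3", batch 4 would load from <cache_dir>/mgx_a1b2c3.mxr (both values illustrative;
// mxr_filename_prefix is defined elsewhere in the EP).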
+ + migraphx::program batch_prog; + if (load_precompiled_model(batch_prog, batch_cache_file)) { + batch_program_cache_[fused_node.Name()][batch] = std::move(batch_prog); + loaded_count++; + LOGS_DEFAULT(INFO) << "[Compile] Pre-loaded cached model for batch size " << batch; + } else { + LOGS_DEFAULT(VERBOSE) << "[Compile] No cached model found for batch size " << batch; + } + } + } + + if (loaded_count > 0) { + LOGS_DEFAULT(INFO) << "[Compile] Pre-loaded " << loaded_count << " cached models into memory"; + } else { + LOGS_DEFAULT(INFO) << "[Compile] No pre-existing cached models found"; + } + } + NodeComputeInfo compute_info; compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { std::unique_ptr p = std::make_unique(); From 8fd67a676eaf70a0cc914b8dd867d8c1f26750f4 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Thu, 1 Jan 2026 23:50:23 -0600 Subject: [PATCH 4/9] [AI Generated] Disable output shape verification unless verbose logging is enabled - Wrap the output shape verification loop in a check for verbose logging mode - Only runs the shape verification code when logging severity <= kVERBOSE - Reduces overhead during normal inference execution --- .../migraphx/migraphx_execution_provider.cc | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index f9a4f764f4a4e..b02c23eb27656 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -2201,19 +2201,21 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& auto prog_outputs = prog.run_async(m, static_cast(rocm_stream)); LOGS_DEFAULT(VERBOSE) << "[Compute] Execution complete, got " << prog_outputs.size() << " outputs"; - // Verify actual output shapes match expectations - for (std::size_t i = 0; i < prog_outputs.size(); ++i) { - auto actual_shape = prog_outputs[i].get_shape(); - auto actual_lens = actual_shape.lengths(); - std::ostringstream ss; - ss << "["; - for (size_t j = 0; j < actual_lens.size(); ++j) { - if (j > 0) ss << ", "; - ss << actual_lens[j]; + // Verify actual output shapes match expectations (only in verbose mode) + if (logging::LoggingManager::DefaultLogger().GetSeverity() <= logging::Severity::kVERBOSE) { + for (std::size_t i = 0; i < prog_outputs.size(); ++i) { + auto actual_shape = prog_outputs[i].get_shape(); + auto actual_lens = actual_shape.lengths(); + std::ostringstream ss; + ss << "["; + for (size_t j = 0; j < actual_lens.size(); ++j) { + if (j > 0) ss << ", "; + ss << actual_lens[j]; + } + ss << "]"; + LOGS_DEFAULT(VERBOSE) << "[Compute] Actual output " << i << " shape after execution: " << ss.str() + << (actual_lens.size() > 0 ? " (batch=" + std::to_string(actual_lens[0]) + ")" : ""); } - ss << "]"; - LOGS_DEFAULT(VERBOSE) << "[Compute] Actual output " << i << " shape after execution: " << ss.str() - << (actual_lens.size() > 0 ? " (batch=" + std::to_string(actual_lens[0]) + ")" : ""); } // In case of input parameters are reused as output parameter call hipMemcpy From 443a4a741a762d8080475ed6e7ee2fcb4fd34053 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Fri, 2 Jan 2026 15:13:36 -0600 Subject: [PATCH 5/9] [AI Generated] Ensure all batch sizes are pre-compiled before compute threads start Key changes: 1. 
Unified batch pre-compilation into a single block that runs after main compilation (both cache hit and cache miss cases) 2. Fixed hash calculation to include ALL inputs with updated batch sizes, not just the first input - ensures correct cache file names for multi-input models 3. Always compile if load_precompiled_model fails for any batch size 4. Removed duplicate pre-compilation block that ran only on cache miss 5. Removed unused CompileProgramWithBatch helper function 6. Removed unused precompile_done_ member variable 7. Only initialize batch_program_cache_ if not already initialized to preserve any programs compiled earlier The batch cache is now fully populated before NodeComputeInfo is created, ensuring compute threads have access to all pre-compiled programs. --- .../migraphx/migraphx_execution_provider.cc | 406 +++++------------- .../migraphx/migraphx_execution_provider.h | 1 - 2 files changed, 112 insertions(+), 295 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index b02c23eb27656..51290bb747bed 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1341,65 +1341,6 @@ std::vector GetPowerOf2BatchSizes(size_t max_batch) { return batch_sizes; } -// Helper: Compile a single program with specific batch size -migraphx::program CompileProgramWithBatch( - const std::string& onnx_string, - const std::vector& input_names, - const std::vector& base_input_shape, - size_t batch_size, - migraphx::onnx_options& options, - const migraphx::target& t, - bool fp16_enable, - bool bf16_enable, - bool int8_enable, - bool fp8_enable, - bool int8_calibration_cache_available, - std::unordered_map& dynamic_range_map, - bool exhaustive_tune, - const std::filesystem::path& model_path) { - - LOGS_DEFAULT(VERBOSE) << "[PreCompile] Compiling for batch size: " << batch_size; - - // Set input shapes with the specified batch size - std::vector shape_with_batch; - shape_with_batch.push_back(batch_size); - for (size_t i = 1; i < base_input_shape.size(); ++i) { - shape_with_batch.push_back(static_cast(base_input_shape[i])); - } - - // Assume single input for now (can be extended) - if (!input_names.empty()) { - options.set_input_parameter_shape(input_names[0], shape_with_batch); - - std::ostringstream ss; - ss << "["; - for (size_t i = 0; i < shape_with_batch.size(); ++i) { - if (i > 0) ss << ", "; - ss << shape_with_batch[i]; - } - ss << "]"; - LOGS_DEFAULT(VERBOSE) << "[PreCompile] Input '" << input_names[0] << "' shape: " << ss.str(); - } - -#ifndef ENABLE_TRAINING_CORE -#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH - if (!model_path.empty()) { - options.set_external_data_path(model_path.parent_path().string()); - } -#endif -#endif - - migraphx::program prog = migraphx::parse_onnx_buffer(onnx_string, options); - migraphx::program_parameters quant_params; - - calibrate_and_quantize(prog, t, quant_params, fp16_enable, bf16_enable, int8_enable, - fp8_enable, int8_calibration_cache_available, dynamic_range_map); - compile_program(prog, t, exhaustive_tune); - - LOGS_DEFAULT(VERBOSE) << "[PreCompile] Compilation complete for batch size: " << batch_size; - return prog; -} - Status MIGraphXExecutionProvider::Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) { migraphx::onnx_options options; @@ -1594,69 +1535,7 @@ Status MIGraphXExecutionProvider::Compile(const 
std::vector& LOGS_DEFAULT(VERBOSE) << "[Compile] Saving compiled model to cache: " << model_cache_file.string(); save_compiled_model(prog, model_cache_file); LOGS_DEFAULT(VERBOSE) << "[Compile] Model saved successfully with batch-aware filename"; - - // Pre-compile for power-of-2 batch sizes if max_dynamic_batch is set and this is first compilation - if (max_dynamic_batch_ > 1 && !precompile_done_ && !input_names.empty()) { - LOGS_DEFAULT(INFO) << "[PreCompile] Starting pre-compilation for batch sizes up to " << max_dynamic_batch_; - - auto batch_sizes = GetPowerOf2BatchSizes(max_dynamic_batch_); - LOGS_DEFAULT(INFO) << "[PreCompile] Will compile for " << batch_sizes.size() << " batch sizes"; - - // Get base input shape (without batch dimension) - std::vector base_shape; - if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) { - auto tensor_shape = input_tensor[0]->Shape(); - for (int j = 1; j < tensor_shape->dim_size(); ++j) { - if (tensor_shape->dim(j).has_dim_value()) { - base_shape.push_back(tensor_shape->dim(j).dim_value()); - } - } - } - - // Pre-compile for each batch size - for (size_t batch : batch_sizes) { - try { - LOGS_DEFAULT(INFO) << "[PreCompile] Compiling for batch size: " << batch; - - // Build shape with this batch size - std::vector batch_input_shapes; - batch_input_shapes.push_back(batch); - batch_input_shapes.insert(batch_input_shapes.end(), base_shape.begin(), base_shape.end()); - - // Generate cache file name - auto batch_cache_hash = make_hash(batch_input_shapes); - auto batch_cache_file = model_cache_path_ / (mxr_filename_prefix + batch_cache_hash + ".mxr"); - - migraphx::program batch_prog; - if (!load_precompiled_model(batch_prog, batch_cache_file)) { - // Compile if not in cache - migraphx::onnx_options batch_options = options; - batch_prog = CompileProgramWithBatch( - onnx_string_buffer, input_names, batch_input_shapes, batch, - batch_options, t_, fp16_enable_, bf16_enable_, int8_enable_, fp8_enable_, - int8_calibration_cache_available_, dynamic_range_map_, exhaustive_tune_, model_path_); - - // Save to disk - save_compiled_model(batch_prog, batch_cache_file); - LOGS_DEFAULT(INFO) << "[PreCompile] Saved batch " << batch << " model to: " << batch_cache_file.string(); - } else { - LOGS_DEFAULT(INFO) << "[PreCompile] Loaded pre-existing model for batch " << batch; - } - - // Store in batch cache - { - std::lock_guard lock(batch_cache_mutex_); - batch_program_cache_[fused_node.Name()][batch] = std::move(batch_prog); - } - - } catch (const std::exception& e) { - LOGS_DEFAULT(WARNING) << "[PreCompile] Failed to compile batch " << batch << ": " << e.what(); - } - } - - precompile_done_ = true; - LOGS_DEFAULT(INFO) << "[PreCompile] Pre-compilation complete"; - } + // Note: Batch pre-compilation happens after this block for all cases (cache hit or miss) } else { LOGS_DEFAULT(VERBOSE) << "[Compile] Cache hit! 
Loaded precompiled model from: " << model_cache_file.string(); } @@ -1690,66 +1569,119 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& map_input_index_[fused_node.Name()] = input_name_index; map_no_input_shape_[fused_node.Name()] = no_input_shape; - // Initialize batch program cache for this node - batch_program_cache_[fused_node.Name()] = std::map(); + // Initialize batch program cache for this node (only if not already initialized) + if (batch_program_cache_.find(fused_node.Name()) == batch_program_cache_.end()) { + batch_program_cache_[fused_node.Name()] = std::map(); + } - // Pre-load existing cached .mxr files into batch_program_cache_ if max_dynamic_batch_ > 1 - // This eliminates repeated disk lookups during inference + // Pre-compile/load programs for all batch sizes if max_dynamic_batch_ > 1 + // This ensures all batch sizes are available in memory before compute threads start if (max_dynamic_batch_ > 1 && !model_cache_path_.empty() && !no_input_shape) { - LOGS_DEFAULT(INFO) << "[Compile] Pre-loading existing cached models for fast lookup..."; + LOGS_DEFAULT(INFO) << "[Compile] Ensuring all batch sizes up to " << max_dynamic_batch_ << " are compiled and cached..."; std::lock_guard lock(batch_cache_mutex_); - // Generate batch sizes to check (powers of 2 up to max_dynamic_batch_) - std::vector batch_sizes_to_check; - for (size_t batch = 1; batch <= max_dynamic_batch_; batch *= 2) { - batch_sizes_to_check.push_back(batch); - } - // Add max_dynamic_batch_ if it's not a power of 2 - if (batch_sizes_to_check.empty() || batch_sizes_to_check.back() != max_dynamic_batch_) { - batch_sizes_to_check.push_back(max_dynamic_batch_); - } + // Generate batch sizes to compile (powers of 2 up to max_dynamic_batch_) + auto batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_); - // Get base shape from first input tensor (excluding batch dimension) - std::vector base_shape; - if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) { - auto tensor_shape = input_tensor[0]->Shape(); - for (int j = 1; j < tensor_shape->dim_size(); ++j) { - const auto& dim = tensor_shape->dim(j); - if (dim.has_dim_value()) { - base_shape.push_back(dim.dim_value()); - } else { - base_shape.push_back(1); // Default for symbolic dims + // Build base shapes for ALL inputs (excluding batch dimension) + // This ensures hash is calculated correctly for multi-input models + std::vector> all_input_base_shapes; + for (size_t i = 0; i < input_tensor.size(); ++i) { + std::vector base_shape; + if (input_tensor[i]->Shape() != nullptr) { + auto tensor_shape = input_tensor[i]->Shape(); + for (int j = 1; j < tensor_shape->dim_size(); ++j) { + const auto& dim = tensor_shape->dim(j); + if (dim.has_dim_value()) { + base_shape.push_back(dim.dim_value()); + } else { + base_shape.push_back(1); // Default for symbolic dims + } } } + all_input_base_shapes.push_back(base_shape); } + int compiled_count = 0; int loaded_count = 0; - for (size_t batch : batch_sizes_to_check) { - // Build input shapes with this batch size + for (size_t batch : batch_sizes_to_compile) { + // Skip if already in cache (from earlier pre-compilation) + if (batch_program_cache_[fused_node.Name()].find(batch) != batch_program_cache_[fused_node.Name()].end()) { + LOGS_DEFAULT(VERBOSE) << "[Compile] Batch size " << batch << " already in memory cache, skipping"; + continue; + } + + // Build input shapes with this batch size for ALL inputs (for correct hash) std::vector batch_input_shapes; - batch_input_shapes.push_back(static_cast(batch)); 
- batch_input_shapes.insert(batch_input_shapes.end(), base_shape.begin(), base_shape.end()); + for (size_t i = 0; i < all_input_base_shapes.size(); ++i) { + batch_input_shapes.push_back(static_cast(batch)); // Batch dimension + batch_input_shapes.insert(batch_input_shapes.end(), + all_input_base_shapes[i].begin(), + all_input_base_shapes[i].end()); + } if (!batch_input_shapes.empty()) { auto batch_cache_hash = make_hash(batch_input_shapes); auto batch_cache_file = model_cache_path_ / (mxr_filename_prefix + batch_cache_hash + ".mxr"); + LOGS_DEFAULT(VERBOSE) << "[Compile] Looking for batch " << batch << " cache file: " << batch_cache_file.string(); + migraphx::program batch_prog; if (load_precompiled_model(batch_prog, batch_cache_file)) { + // Successfully loaded from disk batch_program_cache_[fused_node.Name()][batch] = std::move(batch_prog); loaded_count++; - LOGS_DEFAULT(INFO) << "[Compile] Pre-loaded cached model for batch size " << batch; + LOGS_DEFAULT(INFO) << "[Compile] Loaded cached model for batch size " << batch; } else { - LOGS_DEFAULT(VERBOSE) << "[Compile] No cached model found for batch size " << batch; + // Cache miss - compile the program + LOGS_DEFAULT(INFO) << "[Compile] Compiling model for batch size " << batch << "..."; + + try { + // Set input shapes for ALL inputs with the new batch size + migraphx::onnx_options batch_options = options; + for (size_t i = 0; i < input_names.size() && i < all_input_base_shapes.size(); ++i) { + std::vector shape_with_batch; + shape_with_batch.push_back(batch); + for (auto dim : all_input_base_shapes[i]) { + shape_with_batch.push_back(static_cast(dim)); + } + batch_options.set_input_parameter_shape(input_names[i], shape_with_batch); + + LOGS_DEFAULT(VERBOSE) << "[Compile] Set input '" << input_names[i] << "' shape for batch " << batch; + } + +#ifndef ENABLE_TRAINING_CORE +#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH + if (!model_path_.empty()) { + batch_options.set_external_data_path(model_path_.parent_path().string()); + } +#endif +#endif + batch_prog = migraphx::parse_onnx_buffer(onnx_string_buffer, batch_options); + migraphx::program_parameters quant_params; + + calibrate_and_quantize(batch_prog, t_, quant_params, fp16_enable_, bf16_enable_, int8_enable_, + fp8_enable_, int8_calibration_cache_available_, dynamic_range_map_); + compile_program(batch_prog, t_, exhaustive_tune_); + + // Save to disk for future runs + save_compiled_model(batch_prog, batch_cache_file); + LOGS_DEFAULT(INFO) << "[Compile] Saved compiled model for batch " << batch << " to: " << batch_cache_file.string(); + + // Store in memory cache + batch_program_cache_[fused_node.Name()][batch] = std::move(batch_prog); + compiled_count++; + + } catch (const std::exception& e) { + LOGS_DEFAULT(ERROR) << "[Compile] Failed to compile batch " << batch << ": " << e.what(); + } } } } - if (loaded_count > 0) { - LOGS_DEFAULT(INFO) << "[Compile] Pre-loaded " << loaded_count << " cached models into memory"; - } else { - LOGS_DEFAULT(INFO) << "[Compile] No pre-existing cached models found"; - } + LOGS_DEFAULT(INFO) << "[Compile] Batch cache ready: " << loaded_count << " loaded from disk, " + << compiled_count << " newly compiled, " + << batch_program_cache_[fused_node.Name()].size() << " total batch sizes available"; } NodeComputeInfo compute_info; @@ -1960,147 +1892,33 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& } if (!found_in_cache) { - LOGS_DEFAULT(VERBOSE) << "[Compute] Batch size " << requested_batch << " not in cache, need to compile"; - - 
std::filesystem::path model_cache_file; - // empty cache path means the MXR caching is disabled - always compile - if (!model_cache_path_.empty()) { - // Ensure input_shapes has all updated dimensions including new batch sizes - if (input_shapes.empty()) { - LOGS_DEFAULT(WARNING) << "[Compute] Input shapes vector is empty, rebuilding from current inputs"; - for (auto&& name : param_shapes.names()) { - if (map_input_name_index.count(name) > 0) { - auto input_tensor = ctx.GetInput(map_input_name_index[name]); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shape = tensor_info.GetShape(); - input_shapes.insert(input_shapes.end(), tensor_shape.begin(), tensor_shape.end()); - } - } - } - - // Log the shapes being used for cache key generation - std::ostringstream shapes_str; - shapes_str << "["; - for (size_t i = 0; i < input_shapes.size(); ++i) { - if (i > 0) shapes_str << ", "; - shapes_str << input_shapes[i]; - } - shapes_str << "]"; - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache key input shapes (including updated batch): " << shapes_str.str(); - - auto cache_hash = make_hash(input_shapes); - model_cache_file = mgx_state->model_cache_dir / (mxr_filename_prefix + cache_hash + ".mxr"); - LOGS_DEFAULT(VERBOSE) << "[Compute] Looking for MXR file: " << model_cache_file.string(); - } - - if (!load_precompiled_model(prog, model_cache_file)) { - LOGS_DEFAULT(VERBOSE) << "[Compute] Cache miss. Compiling model with updated batch size"; - - // CRITICAL: Ensure ALL input parameter shapes are explicitly set as static shapes in cmp_options - // This must be done BEFORE parsing to treat dynamic shapes as static for compilation - // NOTE: Only set shapes for actual runtime input parameters, NOT for constants/initializers - // MIGraphX will automatically infer shapes for constants and intermediate tensors - LOGS_DEFAULT(VERBOSE) << "[Compute] Setting " << map_input_name_index.size() - << " input parameter shapes as static in MIGraphX options (excluding constants)"; - - for (auto& it : map_input_name_index) { - auto& name = it.first; - auto& index = it.second; - auto input_tensor = ctx.GetInput(index); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shape = tensor_info.GetShape(); - std::vector ort_lens(tensor_shape.begin(), tensor_shape.end()); - - // Set shape as static parameter for MIGraphX compilation - // Only for actual input parameters - constants/initializers are handled by MIGraphX - cmp_options.set_input_parameter_shape(name, ort_lens); - - LOGS_DEFAULT(VERBOSE) << "[Compute] Set static shape for input parameter '" << name << "': [" - << [&]() { - std::ostringstream ss; - for (size_t i = 0; i < ort_lens.size(); ++i) { - if (i > 0) ss << ", "; - ss << ort_lens[i]; - } - return ss.str(); - }() << "]"; - } - LOGS_DEFAULT(VERBOSE) << "[Compute] All input parameter shapes set as static"; - LOGS_DEFAULT(VERBOSE) << "[Compute] MIGraphX will infer shapes for constants and intermediate tensors"; - -#ifndef ENABLE_TRAINING_CORE -#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH - cmp_options.set_external_data_path(model_path_.parent_path().string()); -#endif -#endif - LOGS_DEFAULT(VERBOSE) << "[Compute] Parsing ONNX buffer with static input shapes"; - prog = migraphx::parse_onnx_buffer(onnx_string, cmp_options); - LOGS_DEFAULT(VERBOSE) << "[Compute] ONNX parsing complete"; - - // Verify that MIGraphX parsed with correct shapes for input parameters - auto parsed_param_shapes = prog.get_parameter_shapes(); - LOGS_DEFAULT(VERBOSE) << 
"[Compute] Verifying parsed parameter shapes (" - << parsed_param_shapes.size() << " total parameters):"; - for (auto&& param_name : parsed_param_shapes.names()) { - auto shape = parsed_param_shapes[param_name]; - auto lens = shape.lengths(); - std::ostringstream ss; - ss << "["; - for (size_t i = 0; i < lens.size(); ++i) { - if (i > 0) ss << ", "; - ss << lens[i]; - } - ss << "]"; - - // Distinguish between input parameters we set and constants MIGraphX inferred - bool is_input_param = (map_input_name_index.count(param_name) > 0); - LOGS_DEFAULT(VERBOSE) << "[Compute] Parameter '" << param_name << "' parsed shape: " << ss.str() - << (is_input_param ? " (input parameter)" : " (constant/internal)"); - } - - migraphx::program_parameters quant_params; - - if ((int8_enable ^ fp8_enable) && int8_calibration_cache_available) { - auto local_param_shapes = prog.get_parameter_shapes(); - // Add input parameter data and the values they're set to - for (auto&& name : local_param_shapes.names()) { - if (map_input_name_index.count(name) > 0) { - auto input_tensor = ctx.GetInput(map_input_name_index[name]); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shape = tensor_info.GetShape(); - const auto tensor_type = tensor_info.GetElementType(); - - migraphx_shape_datatype_t mgx_type; - getMIGraphXType(tensor_type, mgx_type); - auto mgx_s = local_param_shapes[name]; - - if (mgx_type != mgx_s.type()) { - LOGS_DEFAULT(FATAL) << "MIGraphX: param type mismatch"; - } - quant_params.add(name, migraphx::argument(local_param_shapes[name], const_cast(input_tensor.GetTensorRawData()))); - } + // Batch size not found in pre-compiled cache - return error + // All batch sizes should be pre-compiled during Compile phase + LOGS_DEFAULT(ERROR) << "[Compute] Batch size " << requested_batch + << " not found in pre-compiled cache. " + << "Ensure max_dynamic_batch is set correctly and includes this batch size."; + + // List available batch sizes in cache for debugging + { + std::lock_guard lock(*mgx_state->batch_cache_mutex_ptr); + auto& batch_cache = *mgx_state->batch_program_cache_ptr; + std::ostringstream available; + available << "Available batch sizes: ["; + bool first = true; + for (const auto& kv : batch_cache) { + if (!first) available << ", "; + available << kv.first; + first = false; } + available << "]"; + LOGS_DEFAULT(ERROR) << "[Compute] " << available.str(); } - calibrate_and_quantize(prog, t, quant_params, fp16_enable, bf16_enable, int8_enable, - fp8_enable, int8_calibration_cache_available, map_dynamic_range); - compile_program(prog, t, exhaustive_tune_); - - // Save compiled model with batch-aware filename - LOGS_DEFAULT(VERBOSE) << "[Compute] Saving compiled model with updated batch size to: " - << model_cache_file.string(); - save_compiled_model(prog, model_cache_file); - LOGS_DEFAULT(VERBOSE) << "[Compute] Model saved to disk"; - } else { - LOGS_DEFAULT(VERBOSE) << "[Compute] Loaded MXR from disk: " << model_cache_file.string(); - } - // Store in batch cache for future use - { - std::lock_guard lock(*mgx_state->batch_cache_mutex_ptr); - (*mgx_state->batch_program_cache_ptr)[requested_batch] = prog; - LOGS_DEFAULT(VERBOSE) << "[Compute] Stored program in batch cache for batch size: " << requested_batch; + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "MIGraphX: Batch size ", requested_batch, " not pre-compiled. 
", + "Set ORT_MIGRAPHX_MAX_DYNAMIC_BATCH environment variable to at least ", requested_batch, + " to pre-compile this batch size during model loading."); } - } mgx_state->prog = prog; param_shapes = prog.get_parameter_shapes(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 394744efd3508..c94eecf2a95df 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -148,7 +148,6 @@ class MIGraphXExecutionProvider : public IExecutionProvider { // Key: node_name, Value: map of batch_size -> program std::unordered_map> batch_program_cache_; std::mutex batch_cache_mutex_; // Protect batch_program_cache_ - bool precompile_done_ = false; // Track if we've done initial precompilation AllocatorPtr allocator_; std::unique_ptr metadef_id_generator_; From 3285f836dc6f546641b37b219349107f394f8429 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous Date: Fri, 2 Jan 2026 15:22:23 -0600 Subject: [PATCH 6/9] [AI Generated] Restore CompileProgramWithBatch and improve batch compilation logic Key changes: 1. Restored CompileProgramWithBatch helper function (now handles ALL inputs, not just the first one) 2. When max_dynamic_batch > 0: compile power-of-2 batch sizes up to max 3. When max_dynamic_batch == 0 (not set): only compile the single batch size from the model's input shape 4. Uses CompileProgramWithBatch for cleaner compilation with proper input shape handling for multi-input models 5. Always compiles if load_precompiled_model fails for any batch size This ensures: - Models with max_dynamic_batch set get all power-of-2 batch sizes compiled - Models without max_dynamic_batch only compile the necessary batch size - All compiled programs are stored in batch cache before compute threads start --- .../migraphx/migraphx_execution_provider.cc | 161 ++++++++++++------ 1 file changed, 109 insertions(+), 52 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 51290bb747bed..08b89e1b5cc9c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1341,6 +1341,63 @@ std::vector GetPowerOf2BatchSizes(size_t max_batch) { return batch_sizes; } +// Helper: Compile a single program with specific batch size for all inputs +migraphx::program CompileProgramWithBatch( + const std::string& onnx_string, + const std::vector& input_names, + const std::vector>& all_input_base_shapes, + size_t batch_size, + migraphx::onnx_options options, + const migraphx::target& t, + bool fp16_enable, + bool bf16_enable, + bool int8_enable, + bool fp8_enable, + bool int8_calibration_cache_available, + std::unordered_map& dynamic_range_map, + bool exhaustive_tune, + const std::filesystem::path& model_path) { + + LOGS_DEFAULT(VERBOSE) << "[CompileBatch] Compiling for batch size: " << batch_size; + + // Set input shapes with the specified batch size for ALL inputs + for (size_t i = 0; i < input_names.size() && i < all_input_base_shapes.size(); ++i) { + std::vector shape_with_batch; + shape_with_batch.push_back(batch_size); + for (auto dim : all_input_base_shapes[i]) { + shape_with_batch.push_back(static_cast(dim)); + } + options.set_input_parameter_shape(input_names[i], shape_with_batch); + + std::ostringstream ss; + ss << "["; + for (size_t j = 
0; j < shape_with_batch.size(); ++j) {
+      if (j > 0) ss << ", ";
+      ss << shape_with_batch[j];
+    }
+    ss << "]";
+    LOGS_DEFAULT(VERBOSE) << "[CompileBatch] Input '" << input_names[i] << "' shape: " << ss.str();
+  }
+
+#ifndef ENABLE_TRAINING_CORE
+#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH
+  if (!model_path.empty()) {
+    options.set_external_data_path(model_path.parent_path().string());
+  }
+#endif
+#endif
+
+  migraphx::program prog = migraphx::parse_onnx_buffer(onnx_string, options);
+  migraphx::program_parameters quant_params;
+
+  calibrate_and_quantize(prog, t, quant_params, fp16_enable, bf16_enable, int8_enable,
+                         fp8_enable, int8_calibration_cache_available, dynamic_range_map);
+  compile_program(prog, t, exhaustive_tune);
+
+  LOGS_DEFAULT(VERBOSE) << "[CompileBatch] Compilation complete for batch size: " << batch_size;
+  return prog;
+}
+
 Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes,
                                           std::vector<NodeComputeInfo>& node_compute_funcs) {
   migraphx::onnx_options options;
@@ -1574,32 +1631,54 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     batch_program_cache_[fused_node.Name()] = std::map<size_t, migraphx::program>();
   }
 
-  // Pre-compile/load programs for all batch sizes if max_dynamic_batch_ > 1
-  // This ensures all batch sizes are available in memory before compute threads start
-  if (max_dynamic_batch_ > 1 && !model_cache_path_.empty() && !no_input_shape) {
-    LOGS_DEFAULT(INFO) << "[Compile] Ensuring all batch sizes up to " << max_dynamic_batch_ << " are compiled and cached...";
-    std::lock_guard lock(batch_cache_mutex_);
+  // Build base shapes for ALL inputs (excluding batch dimension)
+  // This is needed for both single batch and multi-batch compilation
+  std::vector<std::vector<int64_t>> all_input_base_shapes;
+  for (size_t i = 0; i < input_tensor.size(); ++i) {
+    std::vector<int64_t> base_shape;
+    if (input_tensor[i]->Shape() != nullptr) {
+      auto tensor_shape = input_tensor[i]->Shape();
+      for (int j = 1; j < tensor_shape->dim_size(); ++j) {
+        const auto& dim = tensor_shape->dim(j);
+        if (dim.has_dim_value()) {
+          base_shape.push_back(dim.dim_value());
+        } else {
+          base_shape.push_back(1);  // Default for symbolic dims
+        }
+      }
+    }
+    all_input_base_shapes.push_back(base_shape);
+  }
 
-    // Generate batch sizes to compile (powers of 2 up to max_dynamic_batch_)
-    auto batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_);
+  // Pre-compile/load programs for batch sizes
+  // If max_dynamic_batch_ > 0: compile power-of-2 batch sizes up to max_dynamic_batch_
+  // If max_dynamic_batch_ == 0: only compile the single batch size from the model input
+  if (!model_cache_path_.empty() && !no_input_shape) {
+    std::lock_guard lock(batch_cache_mutex_);
 
-    // Build base shapes for ALL inputs (excluding batch dimension)
-    // This ensures hash is calculated correctly for multi-input models
-    std::vector<std::vector<int64_t>> all_input_base_shapes;
-    for (size_t i = 0; i < input_tensor.size(); ++i) {
-      std::vector<int64_t> base_shape;
-      if (input_tensor[i]->Shape() != nullptr) {
-        auto tensor_shape = input_tensor[i]->Shape();
-        for (int j = 1; j < tensor_shape->dim_size(); ++j) {
-          const auto& dim = tensor_shape->dim(j);
-          if (dim.has_dim_value()) {
-            base_shape.push_back(dim.dim_value());
-          } else {
-            base_shape.push_back(1);  // Default for symbolic dims
+    // Determine which batch sizes to compile
+    std::vector<size_t> batch_sizes_to_compile;
+    if (max_dynamic_batch_ > 0) {
+      // Compile power-of-2 batch sizes up to max_dynamic_batch_
+      batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_);
+      LOGS_DEFAULT(INFO) << "[Compile] Compiling " << batch_sizes_to_compile.size()
+                         << " batch sizes (powers of 2 up to " << max_dynamic_batch_ << ")";
+    } else {
+      // Only compile the single batch size from the model's input shape
+      // Extract the batch size from the first input tensor
+      size_t single_batch = 1;  // Default
+      if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) {
+        auto tensor_shape = input_tensor[0]->Shape();
+        if (tensor_shape->dim_size() > 0) {
+          const auto& batch_dim = tensor_shape->dim(0);
+          if (batch_dim.has_dim_value()) {
+            single_batch = static_cast<size_t>(batch_dim.dim_value());
           }
         }
       }
-      all_input_base_shapes.push_back(base_shape);
+      batch_sizes_to_compile.push_back(single_batch);
+      LOGS_DEFAULT(INFO) << "[Compile] Compiling single batch size: " << single_batch
+                         << " (max_dynamic_batch not set)";
     }
 
     int compiled_count = 0;
@@ -1615,8 +1694,8 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     std::vector<std::size_t> batch_input_shapes;
     for (size_t i = 0; i < all_input_base_shapes.size(); ++i) {
       batch_input_shapes.push_back(static_cast<std::size_t>(batch));  // Batch dimension
-      batch_input_shapes.insert(batch_input_shapes.end(), 
-                                all_input_base_shapes[i].begin(), 
+      batch_input_shapes.insert(batch_input_shapes.end(),
+                                all_input_base_shapes[i].begin(),
                                 all_input_base_shapes[i].end());
     }
 
@@ -1633,36 +1712,14 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
         loaded_count++;
         LOGS_DEFAULT(INFO) << "[Compile] Loaded cached model for batch size " << batch;
       } else {
-        // Cache miss - compile the program
+        // Cache miss - compile the program using CompileProgramWithBatch
        LOGS_DEFAULT(INFO) << "[Compile] Compiling model for batch size " << batch << "...";
-
-        try {
-          // Set input shapes for ALL inputs with the new batch size
-          migraphx::onnx_options batch_options = options;
-          for (size_t i = 0; i < input_names.size() && i < all_input_base_shapes.size(); ++i) {
-            std::vector<std::size_t> shape_with_batch;
-            shape_with_batch.push_back(batch);
-            for (auto dim : all_input_base_shapes[i]) {
-              shape_with_batch.push_back(static_cast<std::size_t>(dim));
-            }
-            batch_options.set_input_parameter_shape(input_names[i], shape_with_batch);
-
-            LOGS_DEFAULT(VERBOSE) << "[Compile] Set input '" << input_names[i] << "' shape for batch " << batch;
-          }
-#ifndef ENABLE_TRAINING_CORE
-#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH
-          if (!model_path_.empty()) {
-            batch_options.set_external_data_path(model_path_.parent_path().string());
-          }
-#endif
-#endif
-          batch_prog = migraphx::parse_onnx_buffer(onnx_string_buffer, batch_options);
-          migraphx::program_parameters quant_params;
-
-          calibrate_and_quantize(batch_prog, t_, quant_params, fp16_enable_, bf16_enable_, int8_enable_,
-                                 fp8_enable_, int8_calibration_cache_available_, dynamic_range_map_);
-          compile_program(batch_prog, t_, exhaustive_tune_);
+        try {
+          batch_prog = CompileProgramWithBatch(
+              onnx_string_buffer, input_names, all_input_base_shapes, batch,
+              options, t_, fp16_enable_, bf16_enable_, int8_enable_, fp8_enable_,
+              int8_calibration_cache_available_, dynamic_range_map_, exhaustive_tune_, model_path_);
 
           // Save to disk for future runs
           save_compiled_model(batch_prog, batch_cache_file);
@@ -1679,7 +1736,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
       }
     }
 
-    LOGS_DEFAULT(INFO) << "[Compile] Batch cache ready: " << loaded_count << " loaded from disk, " 
+    LOGS_DEFAULT(INFO) << "[Compile] Batch cache ready: " << loaded_count << " loaded from disk, "
                        << compiled_count << " newly compiled, "
                        << batch_program_cache_[fused_node.Name()].size() << " total batch sizes available";
  }
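Illustrative note: a minimal standalone sketch of the load-or-compile pattern the hunk above
implements. Program and the three callbacks are hypothetical stand-ins for migraphx::program,
the provider's cache-load helper, CompileProgramWithBatch, and save_compiled_model; this is a
sketch of the control flow, not the provider's actual code.

    #include <cstddef>
    #include <functional>
    #include <map>
    #include <vector>

    template <typename Program>
    std::map<std::size_t, Program> BuildBatchCache(
        const std::vector<std::size_t>& batch_sizes,
        const std::function<bool(std::size_t, Program&)>& try_load,
        const std::function<Program(std::size_t)>& compile,
        const std::function<void(std::size_t, const Program&)>& save) {
      std::map<std::size_t, Program> cache;
      for (std::size_t batch : batch_sizes) {
        Program prog{};
        if (try_load(batch, prog)) {
          cache[batch] = prog;    // hit: reuse the on-disk artifact
        } else {
          prog = compile(batch);  // miss: compile once...
          save(batch, prog);      // ...and persist for future runs
          cache[batch] = prog;
        }
      }
      return cache;
    }
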
From 23d34e7af3be8a56f54d89ada7fd405f7dfb1564 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous
Date: Fri, 2 Jan 2026 15:23:52 -0600
Subject: [PATCH 7/9] [AI Generated] Revert hash calculation to use first input shape only

Reverted the hash calculation for batch cache files to use only the
first input's shape (with batch dimension) instead of all inputs.
This matches the previous behavior.
---
 .../migraphx/migraphx_execution_provider.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index 08b89e1b5cc9c..c69b66384bd8f 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -1661,7 +1661,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     if (max_dynamic_batch_ > 0) {
       // Compile power-of-2 batch sizes up to max_dynamic_batch_
       batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_);
-      LOGS_DEFAULT(INFO) << "[Compile] Compiling " << batch_sizes_to_compile.size() 
+      LOGS_DEFAULT(INFO) << "[Compile] Compiling " << batch_sizes_to_compile.size()
                          << " batch sizes (powers of 2 up to " << max_dynamic_batch_ << ")";
     } else {
       // Only compile the single batch size from the model's input shape
@@ -1677,7 +1677,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
        }
      }
      batch_sizes_to_compile.push_back(single_batch);
-      LOGS_DEFAULT(INFO) << "[Compile] Compiling single batch size: " << single_batch 
+      LOGS_DEFAULT(INFO) << "[Compile] Compiling single batch size: " << single_batch
                          << " (max_dynamic_batch not set)";
    }

@@ -1690,13 +1690,13 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
      continue;
    }

-    // Build input shapes with this batch size for ALL inputs (for correct hash)
+    // Build input shapes with this batch size (using first input's shape for hash)
     std::vector<std::size_t> batch_input_shapes;
-    for (size_t i = 0; i < all_input_base_shapes.size(); ++i) {
-      batch_input_shapes.push_back(static_cast<std::size_t>(batch));  // Batch dimension
+    batch_input_shapes.push_back(static_cast<std::size_t>(batch));  // Batch dimension
+    if (!all_input_base_shapes.empty()) {
      batch_input_shapes.insert(batch_input_shapes.end(),
-                                all_input_base_shapes[i].begin(),
-                                all_input_base_shapes[i].end());
+                                all_input_base_shapes[0].begin(),
+                                all_input_base_shapes[0].end());
    }

    if (!batch_input_shapes.empty()) {

From 68034364c0a9b72bbb3283ca07620aab14d89f00 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous
Date: Fri, 2 Jan 2026 15:31:06 -0600
Subject: [PATCH 8/9] [AI Generated] Fix batch cache not being populated for compute threads

The batch cache was empty because:
1. The pre-compilation block only ran when model_cache_path_ was not empty
2. The main compiled program was never stored in batch_program_cache_

Fix:
1. Always store the main compiled program in batch_program_cache_ with
   its batch size (extracted from the first input tensor)
2. Pre-compilation of additional batch sizes only runs when
   max_dynamic_batch > 0 AND model_cache_path_ is set
3. When max_dynamic_batch == 0, we still have at least the main batch
   size in the cache from the main compilation

This ensures compute threads always have at least one batch size
available in the batch cache.
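For reference, a sketch of how a consumer might resolve a program from the
per-node cache. LookupBatchProgram is hypothetical (the real lookup lives on
the compute path, guarded by batch_cache_mutex_), and rounding up with
lower_bound is one plausible way to exploit the power-of-2 set, assuming
smaller requests can be padded to the next pre-compiled batch:

    #include <cstddef>
    #include <map>
    #include <optional>
    #include <string>

    // Illustrative only: pick the smallest pre-compiled batch size that can
    // hold the request, e.g. a batch-5 request resolves to the batch-8 program.
    template <typename Program>
    std::optional<Program> LookupBatchProgram(
        const std::map<std::string, std::map<std::size_t, Program>>& cache,
        const std::string& node_name, std::size_t batch) {
      auto node_it = cache.find(node_name);
      if (node_it == cache.end()) return std::nullopt;
      auto batch_it = node_it->second.lower_bound(batch);
      if (batch_it == node_it->second.end()) return std::nullopt;  // batch too large
      return batch_it->second;
    }
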
---
 .../migraphx/migraphx_execution_provider.cc | 61 +++++++++----------
 1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index c69b66384bd8f..ac42fdb266b18 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -1626,10 +1626,8 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
   map_input_index_[fused_node.Name()] = input_name_index;
   map_no_input_shape_[fused_node.Name()] = no_input_shape;
 
-  // Initialize batch program cache for this node (only if not already initialized)
-  if (batch_program_cache_.find(fused_node.Name()) == batch_program_cache_.end()) {
-    batch_program_cache_[fused_node.Name()] = std::map<size_t, migraphx::program>();
-  }
+  // Initialize batch program cache for this node
+  batch_program_cache_[fused_node.Name()] = std::map<size_t, migraphx::program>();
 
   // Build base shapes for ALL inputs (excluding batch dimension)
   // This is needed for both single batch and multi-batch compilation
@@ -1650,36 +1648,35 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     all_input_base_shapes.push_back(base_shape);
   }
 
-  // Pre-compile/load programs for batch sizes
-  // If max_dynamic_batch_ > 0: compile power-of-2 batch sizes up to max_dynamic_batch_
-  // If max_dynamic_batch_ == 0: only compile the single batch size from the model input
-  if (!model_cache_path_.empty() && !no_input_shape) {
-    std::lock_guard lock(batch_cache_mutex_);
-
-    // Determine which batch sizes to compile
-    std::vector<size_t> batch_sizes_to_compile;
-    if (max_dynamic_batch_ > 0) {
-      // Compile power-of-2 batch sizes up to max_dynamic_batch_
-      batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_);
-      LOGS_DEFAULT(INFO) << "[Compile] Compiling " << batch_sizes_to_compile.size()
-                         << " batch sizes (powers of 2 up to " << max_dynamic_batch_ << ")";
-    } else {
-      // Only compile the single batch size from the model's input shape
-      // Extract the batch size from the first input tensor
-      size_t single_batch = 1;  // Default
-      if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) {
-        auto tensor_shape = input_tensor[0]->Shape();
-        if (tensor_shape->dim_size() > 0) {
-          const auto& batch_dim = tensor_shape->dim(0);
-          if (batch_dim.has_dim_value()) {
-            single_batch = static_cast<size_t>(batch_dim.dim_value());
-          }
-        }
+  // Extract the batch size from the main compiled program (from first input)
+  size_t main_prog_batch_size = 1;  // Default
+  if (!input_tensor.empty() && input_tensor[0]->Shape() != nullptr) {
+    auto tensor_shape = input_tensor[0]->Shape();
+    if (tensor_shape->dim_size() > 0) {
+      const auto& batch_dim = tensor_shape->dim(0);
+      if (batch_dim.has_dim_value()) {
+        main_prog_batch_size = static_cast<size_t>(batch_dim.dim_value());
       }
-      batch_sizes_to_compile.push_back(single_batch);
-      LOGS_DEFAULT(INFO) << "[Compile] Compiling single batch size: " << single_batch
-                         << " (max_dynamic_batch not set)";
     }
+  }
+
+  // Always store the main compiled program in the batch cache
+  // This ensures at least one batch size is always available
+  if (!no_input_shape) {
+    std::lock_guard lock(batch_cache_mutex_);
+    batch_program_cache_[fused_node.Name()][main_prog_batch_size] = prog;
+    LOGS_DEFAULT(INFO) << "[Compile] Stored main program in batch cache for batch size: " << main_prog_batch_size;
+  }
+
+  // Pre-compile/load additional programs for other batch sizes when max_dynamic_batch_ > 0
+  // This compiles power-of-2 batch sizes up to max_dynamic_batch_
+  if (!model_cache_path_.empty() && !no_input_shape && max_dynamic_batch_ > 0) {
+    std::lock_guard lock(batch_cache_mutex_);
+
+    // Compile power-of-2 batch sizes up to max_dynamic_batch_
+    auto batch_sizes_to_compile = GetPowerOf2BatchSizes(max_dynamic_batch_);
+    LOGS_DEFAULT(INFO) << "[Compile] Pre-compiling " << batch_sizes_to_compile.size()
+                       << " batch sizes (powers of 2 up to " << max_dynamic_batch_ << ")";
 
     int compiled_count = 0;
     int loaded_count = 0;

From fb2067b78762a464fb2040c25d7082c1f65e5a44 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous
Date: Fri, 2 Jan 2026 15:42:34 -0600
Subject: [PATCH 9/9] [AI Generated] Fix batch cache empty when model has dynamic/symbolic shapes

When no_input_shape is true (model has dynamic/symbolic dimensions),
the code was previously deferring compilation and leaving prog empty.
This caused the batch cache to be empty, and compute threads would
fail.

Fix:
1. When no_input_shape is true, still compile the model with default
   shapes (batch size 1) so we always have something in the batch cache
2. Always store the main compiled program in the batch cache (removed
   the !no_input_shape condition)

This ensures compute threads always find at least batch size 1 in the
cache, even for models with dynamic shapes.
---
 .../migraphx/migraphx_execution_provider.cc | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index ac42fdb266b18..19ae040bc4443 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -1615,8 +1615,20 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     // NOTE: DO NOT set output shapes as input parameters!
     // Outputs are dynamically inferred by MIGraphX based on input shapes
   } else {
-    LOGS_DEFAULT(VERBOSE) << "[Compile] Deferring compilation until runtime (no static input shapes available)";
-    LOGS_DEFAULT(VERBOSE) << "[Compile] Will use default batch size of 1, then recompile with actual batch at runtime";
+    LOGS_DEFAULT(INFO) << "[Compile] No static input shapes available, compiling with default batch size 1";
+    // Still compile with default shapes so we have something in the batch cache
+#ifndef ENABLE_TRAINING_CORE
+#ifdef HAVE_MIGRAPHX_API_ONNX_OPTIONS_SET_EXTERNAL_DATA_PATH
+    options.set_external_data_path(model_path_.parent_path().string());
+#endif
+#endif
+    prog = migraphx::parse_onnx_buffer(onnx_string_buffer, options);
+    migraphx::program_parameters quant_params;
+
+    calibrate_and_quantize(prog, t_, quant_params, fp16_enable_, bf16_enable_, int8_enable_,
+                           fp8_enable_, int8_calibration_cache_available_, dynamic_range_map_);
+    compile_program(prog, t_, exhaustive_tune_);
+    LOGS_DEFAULT(INFO) << "[Compile] Compiled model with default shapes";
   }
 
   // compile the program
@@ -1662,7 +1674,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
 
   // Always store the main compiled program in the batch cache
   // This ensures at least one batch size is always available
-  if (!no_input_shape) {
+  {
     std::lock_guard lock(batch_cache_mutex_);
     batch_program_cache_[fused_node.Name()][main_prog_batch_size] = prog;
     LOGS_DEFAULT(INFO) << "[Compile] Stored main program in batch cache for batch size: " << main_prog_batch_size;
  }
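
Illustrative note on the fallback path above: a minimal standalone sketch of how
symbolic dimensions collapse to 1 when the default-shape compile runs. OnnxDim is
a hypothetical stand-in for the TensorShapeProto dimension the provider reads;
this is a sketch of the shape-defaulting rule, not the provider's actual code.

    #include <cstdint>
    #include <optional>
    #include <vector>

    struct OnnxDim {
      std::optional<int64_t> dim_value;  // empty => symbolic dim ("N", "seq_len", ...)
    };

    // A model declared as ["N", 3, 224, 224] compiles with [1, 3, 224, 224]
    // on the fallback path: symbolic dims default to 1.
    std::vector<int64_t> DefaultCompileShape(const std::vector<OnnxDim>& dims) {
      std::vector<int64_t> shape;
      shape.reserve(dims.size());
      for (const auto& d : dims) {
        shape.push_back(d.dim_value.value_or(1));
      }
      return shape;
    }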