From 452b4933b156c8ac5d8948fc5c4a5eef95594702 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Thu, 8 Jan 2026 01:51:50 +0000 Subject: [PATCH 1/2] add some more improvements --- .../llvm-project/mlir/lib/IR/MLIRContext.cpp | 2 +- .../rocmlir-tuning-driver.cpp | 50 ++++++++++++++----- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp index 8dc910552cd9..6825eba9a514 100644 --- a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp +++ b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp @@ -76,7 +76,7 @@ struct MLIRContextOptions { static llvm::ManagedStatic clOptions; static bool isThreadingGloballyDisabled() { -#if MLIR_ENABLE_THREADS != 0 +#if LLVM_ENABLE_THREADS!= 0 return clOptions.isConstructed() && clOptions->disableThreading; #else return true; diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp index 2d41c13a6038..34bdadba2704 100644 --- a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp +++ b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp @@ -47,6 +47,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ThreadPool.h" #include #include @@ -88,6 +89,26 @@ void pArgs(const std::tuple &formals, void **_vargs) { using namespace mlir; using namespace rocmlir::tuningdriver; +//===----------------------------------------------------------------------===// +// Shared Resources for Multi-threaded Compilation +//===----------------------------------------------------------------------===// + +/// Returns a shared dialect registry, initialized exactly once. +static DialectRegistry &getSharedDialectRegistry() { + static std::once_flag initFlag; + static DialectRegistry registry; + std::call_once(initFlag, []() { + registerRocMLIRDialects(registry); + }); + return registry; +} + +/// Returns a shared LLVM ThreadPool for all MLIR contexts. +static llvm::DefaultThreadPool &getSharedThreadPool() { + static llvm::DefaultThreadPool pool; + return pool; +} + static llvm::cl::opt inputFilename{ llvm::cl::Positional, llvm::cl::desc(""), llvm::cl::init("-")}; @@ -316,9 +337,12 @@ struct ThreadResources { const rock::KernelOptions &applicabilityOpts, const rock::KernelOptions &compilationKernOpts, const rock::BackendOptions &backendOpts) { - DialectRegistry registry; - registerRocMLIRDialects(registry); - ctx = std::make_unique(registry); + // Use the shared dialect registry (initialized exactly once) + DialectRegistry ®istry = getSharedDialectRegistry(); + // Create context with threading disabled internally, attach shared pool + ctx = std::make_unique(registry, MLIRContext::Threading::DISABLED); + ctx->setThreadPool(getSharedThreadPool()); + ctx->loadAllAvailableDialects(); ctx->getDiagEngine().registerHandler([](Diagnostic &) {}); // Pre-build pipelines once per thread @@ -426,9 +450,9 @@ measureLargeKernel(unsigned iterations, hipStream_t stream, static FailureOr benchmarkKernels(ArrayRef binaries, ArrayRef funcNames, ArrayRef blockSizes, - ArrayRef gridSizes, ArrayRef hostBuffers, + ArrayRef gridSizes, MutableArrayRef gpuBuffers, - ArrayRef bufferSizes, const BenchmarkParams ¶ms) { + const BenchmarkParams ¶ms) { bool benchmarkMode = !params.benchmarkConfig.empty(); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); @@ -440,12 +464,6 @@ benchmarkKernels(ArrayRef binaries, } }); - // Initialize device buffers - for (size_t i = 0; i < bufferSizes.size(); i++) { - HIPCHECK(hipMemcpyAsync(gpuBuffers[i], hostBuffers[i], bufferSizes[i], - hipMemcpyHostToDevice, stream)); - } - // HIP wants an array of pointers to each argument std::vector argPointers; for (void *&item : gpuBuffers) { @@ -731,6 +749,12 @@ static LogicalResult runTuningLoop(ModuleOp source) { gpuBuffers.push_back(gpuBuffer); } + // Copy host buffers to GPU once (reused across all config benchmarks) + for (size_t i = 0; i < bufferLengths.size(); i++) { + HIPCHECK(hipMemcpy(gpuBuffers[i], hostBuffers[i], bufferLengths[i], + hipMemcpyHostToDevice)); + } + // 4. Multi-iteration tuning loop SmallString<64> bestConfigOverall; float bestTimeOverall = std::numeric_limits::max(); @@ -977,8 +1001,8 @@ static LogicalResult runTuningLoop(ModuleOp source) { FailureOr timing = benchmarkKernels(result.hipModules, kernelFuncNames, - result.blockSizes, result.gridSizes, hostBuffers, - gpuBuffers, bufferLengths, benchmarkParams); + result.blockSizes, result.gridSizes, + gpuBuffers, benchmarkParams); if (failed(timing)) { llvm::errs() << "Kernel execution failed\n"; From b586e68200029b916014ad0ef77f411966d1efef Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Thu, 8 Jan 2026 01:54:56 +0000 Subject: [PATCH 2/2] formatting --- .../llvm-project/mlir/lib/IR/MLIRContext.cpp | 2 +- .../rocmlir-tuning-driver.cpp | 26 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp index 6825eba9a514..37f9f97333fa 100644 --- a/external/llvm-project/mlir/lib/IR/MLIRContext.cpp +++ b/external/llvm-project/mlir/lib/IR/MLIRContext.cpp @@ -76,7 +76,7 @@ struct MLIRContextOptions { static llvm::ManagedStatic clOptions; static bool isThreadingGloballyDisabled() { -#if LLVM_ENABLE_THREADS!= 0 +#if LLVM_ENABLE_THREADS != 0 return clOptions.isConstructed() && clOptions->disableThreading; #else return true; diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp index 34bdadba2704..f3291694c4fd 100644 --- a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp +++ b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp @@ -97,9 +97,7 @@ using namespace rocmlir::tuningdriver; static DialectRegistry &getSharedDialectRegistry() { static std::once_flag initFlag; static DialectRegistry registry; - std::call_once(initFlag, []() { - registerRocMLIRDialects(registry); - }); + std::call_once(initFlag, []() { registerRocMLIRDialects(registry); }); return registry; } @@ -340,7 +338,8 @@ struct ThreadResources { // Use the shared dialect registry (initialized exactly once) DialectRegistry ®istry = getSharedDialectRegistry(); // Create context with threading disabled internally, attach shared pool - ctx = std::make_unique(registry, MLIRContext::Threading::DISABLED); + ctx = std::make_unique(registry, + MLIRContext::Threading::DISABLED); ctx->setThreadPool(getSharedThreadPool()); ctx->loadAllAvailableDialects(); ctx->getDiagEngine().registerHandler([](Diagnostic &) {}); @@ -447,12 +446,12 @@ measureLargeKernel(unsigned iterations, hipStream_t stream, } // In order to match rocprof, returns time in nanoseconds -static FailureOr -benchmarkKernels(ArrayRef binaries, - ArrayRef funcNames, ArrayRef blockSizes, - ArrayRef gridSizes, - MutableArrayRef gpuBuffers, - const BenchmarkParams ¶ms) { +static FailureOr benchmarkKernels(ArrayRef binaries, + ArrayRef funcNames, + ArrayRef blockSizes, + ArrayRef gridSizes, + MutableArrayRef gpuBuffers, + const BenchmarkParams ¶ms) { bool benchmarkMode = !params.benchmarkConfig.empty(); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); @@ -999,10 +998,9 @@ static LogicalResult runTuningLoop(ModuleOp source) { assert(result.status == CompilationStatus::Success && "Unexpected compilation status in benchmarking phase"); - FailureOr timing = - benchmarkKernels(result.hipModules, kernelFuncNames, - result.blockSizes, result.gridSizes, - gpuBuffers, benchmarkParams); + FailureOr timing = benchmarkKernels( + result.hipModules, kernelFuncNames, result.blockSizes, + result.gridSizes, gpuBuffers, benchmarkParams); if (failed(timing)) { llvm::errs() << "Kernel execution failed\n";