From e67a96b460674133f441dbcc78284a1fa79b3396 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Fri, 31 Oct 2025 03:21:39 -0700
Subject: [PATCH 01/12] Reorder KV cache using the new gather_by_axis API

---
 .../core/providers/openvino/ov_interface.cc | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index a57db77c37dfa..9350d348450cf 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -507,6 +507,29 @@ void StatefulOVInferRequest::Infer() {
   OVInferRequest::Infer();
 }
 
+void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  // Validate input parameters
+  if (src_indices.size() != dst_indices.size()) {
+    ORT_THROW(log_tag + "ReorderKVCache: src_indices and dst_indices must have the same size. "
+              "Got src_indices.size()=" + std::to_string(src_indices.size()) +
+              ", dst_indices.size()=" + std::to_string(dst_indices.size()));
+  }
+
+  LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
+                     << src_indices.size() << " index pairs";
+
+  // Retrieve KVCache states and reorder them based on the provided indices
+  auto states = ovInfReq.query_state();
+
+  for (auto& state : states) {
+    auto start_time = std::chrono::high_resolution_clock::now();
+    state.gather_by_axis(src_indices, dst_indices);
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds";
+  }
+}
+
 void StatefulOVInferRequest::RewindKVCache(size_t index) {
   LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index="
                      << index;

From 349eff9a7e20388da854fec73bc09e9627fa5833 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 13 Nov 2025 06:07:10 -0800
Subject: [PATCH 02/12] Do a ScatterElementsUpdate-based reorder during
 execution

---
 .../core/providers/openvino/ov_interface.cc | 34 ++++++++++++++++++-
 .../core/providers/openvino/ov_interface.h  |  2 ++
 .../openvino/ov_stateful_patch_utils.cc     | 17 ++++++++--
 .../openvino/ov_stateful_patch_utils.h      |  1 +
 4 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 9350d348450cf..0090783d1e088 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,7 +467,25 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
- FillTensor("beam_idx", ov::element::i32, {1}, 0); + if (beam_idx_val.size() == 3) { + ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3}); + for (int i = 0; i < 3; ++i) { + beam_idx_tensor.data()[i] = int32_t(beam_idx_val[i]); + } + ovInfReq.set_tensor("beam_idx", beam_idx_tensor); + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96}); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 32; ++j) { + for (int k = 0; k < 96; ++k) { + dst_idx_tensor.data()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]); + } + } + } + ovInfReq.set_tensor("dst_idx", dst_idx_tensor); + } else { + FillTensor("beam_idx", ov::element::i32, {3}, 0); + FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0); + } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. if (prefill_use_full_chat_history) { @@ -518,6 +536,19 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; + // set beam_idx and dst_idx based on provided values + if (beam_idx_val.size() == 0) { + for (int i = 0; i < 3; ++i) { + beam_idx_val.emplace_back(src_indices[i]); + dst_idx_val.emplace_back(dst_indices[i]); + } + } else { + for (int i = 0; i < 3; ++i) { + beam_idx_val[i] = src_indices[i]; + dst_idx_val[i] = dst_indices[i]; + } + } + /* // Retrieve KVCache states and reorder them based on the provided indices auto states = ovInfReq.query_state(); @@ -528,6 +559,7 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic auto duration = std::chrono::duration_cast(end_time - start_time).count(); LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; } + */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index aa4b3fbe64898..8073157e4450e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -160,6 +160,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; + std::vector beam_idx_val; + std::vector dst_idx_val; bool IsNPULogitsSliceRequired(); bool _npu_logits_slice_required = false; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index fd2b5797a1f40..a3555201f3939 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -91,13 +91,21 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; + auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape(); + update_shape[2] = 3; - auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({std::move(input_batch)})); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({3})); beam_idx->set_friendly_name("beam_idx"); beam_idx->output(0).get_tensor().add_names({"beam_idx"}); ov_model->add_parameters({beam_idx}); not_kv_inputs.push_back(beam_idx->get_friendly_name()); + auto dst_idx = std::make_shared(ov::element::i32, update_shape); 
+  dst_idx->set_friendly_name("dst_idx");
+  dst_idx->output(0).get_tensor().add_names({"dst_idx"});
+  ov_model->add_parameters({dst_idx});
+  not_kv_inputs.push_back(dst_idx->get_friendly_name());
+
   // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
   for (const auto& input_name : key_value_input_names) {
     auto parameter_output_port = ov_model->input(input_name);
@@ -106,11 +114,14 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     auto gather_op = std::make_shared<ov::opset13::Gather>(parameter_output_port,
                                                            beam_idx,
-                                                           ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
+                                                           ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+
+    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(parameter_output_port,
+                                                                          dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {
-      consumer.replace_source_output(gather_op->output(0));
+      consumer.replace_source_output(update_op->output(0));
     }
   }
 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 0b89c4ed02e13..11b0cc1dbe9bb 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -13,6 +13,7 @@
 
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/make_stateful.hpp"
+#include "openvino/opsets/opset12.hpp"
 #include "openvino/opsets/opset13.hpp"
 
 namespace onnxruntime {

From 49ed42a61da9dd889dafb6a70cc6d08eab109f06 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 13 Nov 2025 09:47:05 -0800
Subject: [PATCH 03/12] Get variable update lengths from incoming indices

---
 .../core/providers/openvino/ov_interface.cc | 31 ++++++++-----------
 .../openvino/ov_stateful_patch_utils.cc     |  4 +--
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 0090783d1e088..2df8f8b412a91 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,24 +467,24 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
-  if (beam_idx_val.size() == 3) {
-    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3});
-    for (int i = 0; i < 3; ++i) {
+  if (beam_idx_val.size() > 0) {
+    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()});
+    for (int i = 0; i < beam_idx_val.size(); ++i) {
       beam_idx_tensor.data<int32_t>()[i] = int32_t(beam_idx_val[i]);
     }
     ovInfReq.set_tensor("beam_idx", beam_idx_tensor);
-    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96});
-    for (int i = 0; i < 3; ++i) {
+    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96});
+    for (int i = 0; i < dst_idx_val.size(); ++i) {
       for (int j = 0; j < 32; ++j) {
         for (int k = 0; k < 96; ++k) {
-          dst_idx_tensor.data<int32_t>()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]);
+          dst_idx_tensor.data<int32_t>()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]);
         }
       }
     }
     ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
   } else {
-    FillTensor("beam_idx", ov::element::i32, {3}, 0);
-    FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0);
+    FillTensor("beam_idx", ov::element::i32, {0}, 0);
+    FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0);
   }
 
   // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids.
@@ -537,16 +537,11 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indic
                      << src_indices.size() << " index pairs";
 
   // set beam_idx and dst_idx based on provided values
-  if (beam_idx_val.size() == 0) {
-    for (int i = 0; i < 3; ++i) {
-      beam_idx_val.emplace_back(src_indices[i]);
-      dst_idx_val.emplace_back(dst_indices[i]);
-    }
-  } else {
-    for (int i = 0; i < 3; ++i) {
-      beam_idx_val[i] = src_indices[i];
-      dst_idx_val[i] = dst_indices[i];
-    }
+  beam_idx_val.clear();
+  dst_idx_val.clear();
+  for (int i = 0; i < src_indices.size(); ++i) {
+    beam_idx_val.emplace_back(src_indices[i]);
+    dst_idx_val.emplace_back(dst_indices[i]);
   }
   /*
   // Retrieve KVCache states and reorder them based on the provided indices
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index a3555201f3939..73231024a67a2 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -90,11 +90,9 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
   std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates);
 
-  auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0];
   auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape();
-  update_shape[2] = 3;
 
-  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({3}));
+  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
   beam_idx->set_friendly_name("beam_idx");
   beam_idx->output(0).get_tensor().add_names({"beam_idx"});
   ov_model->add_parameters({beam_idx});

From 8dea771c48b4f52b1a37c297988a5a374398aac4 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 20 Nov 2025 09:14:57 -0800
Subject: [PATCH 04/12] Make changes to support new KVCache fusion

---
 .../core/providers/openvino/ov_interface.cc | 18 ++++++++++--------
 .../core/providers/openvino/ov_interface.h  |  2 +-
 .../openvino/ov_stateful_patch_utils.cc     | 18 +++++++++++++++---
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 2df8f8b412a91..1bd5f74846a56 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,12 +467,14 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
-  if (beam_idx_val.size() > 0) {
-    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()});
-    for (int i = 0; i < beam_idx_val.size(); ++i) {
-      beam_idx_tensor.data<int32_t>()[i] = int32_t(beam_idx_val[i]);
+  FillTensor("beam_idx", ov::element::i32, {1}, 0);
+
+  if (src_idx_val.size() > 0) {
+    ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()});
+    for (int i = 0; i < src_idx_val.size(); ++i) {
+      src_idx_tensor.data<int32_t>()[i] = int32_t(src_idx_val[i]);
     }
-    ovInfReq.set_tensor("beam_idx", beam_idx_tensor);
+    ovInfReq.set_tensor("src_idx", src_idx_tensor);
     ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96});
     for (int i = 0; i < dst_idx_val.size(); ++i) {
       for (int j = 0; j < 32; ++j) {
@@ -483,7 +485,7 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
     }
     ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
   } else {
-    FillTensor("beam_idx", ov::element::i32, {0}, 0);
+    FillTensor("src_idx", ov::element::i32, {0}, 0);
     FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0);
   }
 
@@ -537,10 +539,10 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indic
                      << src_indices.size() << " index pairs";
 
   // set beam_idx and dst_idx based on provided values
-  beam_idx_val.clear();
+  src_idx_val.clear();
   dst_idx_val.clear();
   for (int i = 0; i < src_indices.size(); ++i) {
-    beam_idx_val.emplace_back(src_indices[i]);
+    src_idx_val.emplace_back(src_indices[i]);
     dst_idx_val.emplace_back(dst_indices[i]);
   }
   /*
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8073157e4450e..8f3d1cd38a2b6 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -160,7 +160,7 @@ class StatefulOVInferRequest : public OVInferRequest {
   bool prefill_use_full_chat_history = false;
   std::vector<int64_t> cached_input_ids;
   std::vector<int64_t> cached_position_ids;
-  std::vector<size_t> beam_idx_val;
+  std::vector<size_t> src_idx_val;
   std::vector<size_t> dst_idx_val;
 
   bool IsNPULogitsSliceRequired();
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 73231024a67a2..f9b055028bbb5 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -90,14 +90,21 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
   std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates);
 
+  auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0];
   auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape();
 
-  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({std::move(input_batch)}));
   beam_idx->set_friendly_name("beam_idx");
   beam_idx->output(0).get_tensor().add_names({"beam_idx"});
   ov_model->add_parameters({beam_idx});
   not_kv_inputs.push_back(beam_idx->get_friendly_name());
 
+  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+  src_idx->set_friendly_name("src_idx");
+  src_idx->output(0).get_tensor().add_names({"src_idx"});
+  ov_model->add_parameters({src_idx});
+  not_kv_inputs.push_back(src_idx->get_friendly_name());
+
   auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
   dst_idx->set_friendly_name("dst_idx");
   dst_idx->output(0).get_tensor().add_names({"dst_idx"});
@@ -112,10 +119,15 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     auto gather_op = std::make_shared<ov::opset13::Gather>(parameter_output_port,
                                                            beam_idx,
+                                                           ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
+
+    auto update_gather_op =
+        std::make_shared<ov::opset13::Gather>(gather_op,
+                                              src_idx,
                                               ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
-    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(parameter_output_port,
-                                                                          dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
+                                                                          dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {

From 6438df8bef1f1599c240323fb06e61fa3c719fed Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Tue, 2 Dec 2025 03:44:43 -0800
Subject: [PATCH 05/12] Add proper include

---
 onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 11b0cc1dbe9bb..76a3065910ee7 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -13,6 +13,7 @@
 
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/make_stateful.hpp"
+#include "openvino/opsets/opset3.hpp"
 #include "openvino/opsets/opset12.hpp"
 #include "openvino/opsets/opset13.hpp"
 

From c76fb92587584777a45d7ba6c8cf14a944a0c372 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Wed, 15 Oct 2025 18:39:00 -0700
Subject: [PATCH 06/12] add reorder KV cache API

---
 .../providers/openvino/backend_manager.cc   |  6 ++
 .../core/providers/openvino/backend_manager.h |  1 +
 .../openvino/backends/basic_backend.cc      |  6 ++
 .../openvino/backends/basic_backend.h       |  1 +
 .../core/providers/openvino/ibackend.h      |  1 +
 .../openvino/openvino_execution_provider.cc | 62 +++++++++++++++++++
 .../core/providers/openvino/ov_interface.h  |  2 +
 7 files changed, 79 insertions(+)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index fa23f6969b633..d6d098d66242c 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -781,5 +781,11 @@ void BackendManager::RewindKVCache(size_t index) {
   }
 }
 
+void BackendManager::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  if (concrete_backend_) {
+    concrete_backend_->ReorderKVCache(src_indices, dst_indices);
+  }
+}
+
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h
index 9f560340a2033..62cc7d95a4ef9 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.h
+++ b/onnxruntime/core/providers/openvino/backend_manager.h
@@ -31,6 +31,7 @@ class BackendManager {
   void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data);
   ov::CompiledModel GetOVCompiledModel();
   void RewindKVCache(size_t index);
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices);
 
  private:
  std::unique_ptr<ONNX_NAMESPACE::ModelProto> GetModelProtoFromFusedNode(
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 7c3ee7e76c3f9..9f85d42821230 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -315,6 +315,12 @@ void BasicBackend::RewindKVCache(size_t index) {
   });
 }
 
+void BasicBackend::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) {
+    infer_request->ReorderKVCache(src_indices, dst_indices);
+  });
+}
+
 void BasicBackend::Infer(OrtKernelContext* ctx) const {
   Ort::KernelContext context(ctx);
 
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 7639e024c52cb..d2b57fdfbac84 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -137,6 +137,7 @@ class BasicBackend : public IBackend {
     return exe_network_.Get();
   }
   void RewindKVCache(size_t index) override;
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) override;
 
  private:
  bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h
index 365a4625815d6..672fdbc218a78 100644
--- a/onnxruntime/core/providers/openvino/ibackend.h
+++ b/onnxruntime/core/providers/openvino/ibackend.h
@@ -18,6 +18,7 @@ class IBackend {
   virtual ov::CompiledModel GetOVCompiledModel() = 0;
   virtual ~IBackend() = default;
   virtual void RewindKVCache(size_t index) {}
+  virtual void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {}
 };
 using ptr_stream_t = std::unique_ptr<std::istream>;
 class BackendFactory {
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index a099f85b2a4b9..b1ab21d826a2c 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -286,6 +286,68 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const char* const>
+      std::vector<size_t> src_indices;
+      std::vector<size_t> dst_indices;
+
+      try {
+        // Parse source indices from comma-separated string
+        std::stringstream src_stream(src_string);
+        std::string src_token;
+        while (std::getline(src_stream, src_token, ',')) {
+          // Trim whitespace
+          src_token.erase(0, src_token.find_first_not_of(" \t"));
+          src_token.erase(src_token.find_last_not_of(" \t") + 1);
+
+          if (!src_token.empty()) {
+            int64_t index = std::stoll(src_token);
+            if (index >= 0) {
+              src_indices.push_back(static_cast<size_t>(index));
+            } else {
+              LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index;
+            }
+          }
+        }
+
+        // Parse destination indices from comma-separated string
+        std::stringstream dst_stream(dst_string);
+        std::string dst_token;
+        while (std::getline(dst_stream, dst_token, ',')) {
+          // Trim whitespace
+          dst_token.erase(0, dst_token.find_first_not_of(" \t"));
+          dst_token.erase(dst_token.find_last_not_of(" \t") + 1);
+
+          if (!dst_token.empty()) {
+            int64_t index = std::stoll(dst_token);
+            if (index >= 0) {
+              dst_indices.push_back(static_cast<size_t>(index));
+            } else {
+              LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index;
+            }
+          }
+        }
+
+      } catch (const std::exception& e) {
+        LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. "
+                              << "Exception: " << e.what();
+        return Status::OK();
+      }
+
+      // Trigger KVCache Reorder for target Backend with vector arguments
+      for (auto& backend : backend_managers_) {
+        backend.ReorderKVCache(src_indices, dst_indices);
+      }
     } else {
       // Handle unknown options
       LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value;
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8f3d1cd38a2b6..6a001533ff26c 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -136,6 +136,7 @@ class OVInferRequest {
     return ovInfReq;
   }
   virtual void RewindKVCache([[maybe_unused]] size_t index) {}
+  virtual void ReorderKVCache([[maybe_unused]] const std::vector<size_t>& src_indices, [[maybe_unused]] const std::vector<size_t>& dst_indices) {}
 };
 
 class StatefulOVInferRequest : public OVInferRequest {
@@ -144,6 +145,7 @@ class StatefulOVInferRequest : public OVInferRequest {
 
   void Infer() override;
   void RewindKVCache(size_t index) override;
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) override;
   void FillTensor(const std::string& tensor_name, const ov::element::Type& type,
                   const std::vector<size_t>& shape, int32_t fill_value);
   void CacheTensor(const std::string& tensor_name, std::vector<int64_t>& cache);

From 13a9f01c2c9798ef85293070935d58e0eb528ff3 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Fri, 5 Dec 2025 15:02:51 -0800
Subject: [PATCH 07/12] clean up code

---
 .../core/providers/openvino/ov_interface.cc | 35 ++++++-------------
 .../core/providers/openvino/ov_interface.h  |  4 +--
 .../openvino/ov_stateful_patch_utils.cc     |  8 ++---
 .../openvino/ov_stateful_patch_utils.h      |  1 -
 4 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 1bd5f74846a56..b6afda190c444 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -469,17 +469,17 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
FillTensor("beam_idx", ov::element::i32, {1}, 0); - if (src_idx_val.size() > 0) { - ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()}); - for (int i = 0; i < src_idx_val.size(); ++i) { - src_idx_tensor.data()[i] = int32_t(src_idx_val[i]); + if (kv_src_indices.size() > 0) { + ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); + for (int i = 0; i < kv_src_indices.size(); ++i) { + src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96}); - for (int i = 0; i < dst_idx_val.size(); ++i) { + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + for (int i = 0; i < kv_dst_indices.size(); ++i) { for (int j = 0; j < 32; ++j) { for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]); + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); } } } @@ -538,25 +538,12 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; - // set beam_idx and dst_idx based on provided values - src_idx_val.clear(); - dst_idx_val.clear(); + kv_src_indices.clear(); + kv_dst_indices.clear(); for (int i = 0; i < src_indices.size(); ++i) { - src_idx_val.emplace_back(src_indices[i]); - dst_idx_val.emplace_back(dst_indices[i]); + kv_src_indices.emplace_back(src_indices[i]); + kv_dst_indices.emplace_back(dst_indices[i]); } - /* - // Retrieve KVCache states and reorder them based on the provided indices - auto states = ovInfReq.query_state(); - - for (auto& state : states) { - auto start_time = std::chrono::high_resolution_clock::now(); - state.gather_by_axis(src_indices, dst_indices); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; - } - */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 6a001533ff26c..4018aedea3094 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -162,8 +162,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; - std::vector src_idx_val; - std::vector dst_idx_val; + std::vector kv_src_indices; + std::vector kv_dst_indices; bool IsNPULogitsSliceRequired(); bool _npu_logits_slice_required = false; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index f9b055028bbb5..7f2aa4a5cfa3f 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -121,17 +121,17 @@ void FuseCacheReorder(std::shared_ptr ov_model, beam_idx, ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); - auto update_gather_op = + auto updatekv_gather_op = std::make_shared(gather_op, src_idx, ov::opset13::Constant::create(ov::element::i64, {}, {2})); - auto update_op = 
std::make_shared(gather_op, - dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); + auto updatekv_op = std::make_shared(gather_op, + dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { - consumer.replace_source_output(update_op->output(0)); + consumer.replace_source_output(updatekv_op->output(0)); } } diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 76a3065910ee7..11b0cc1dbe9bb 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -13,7 +13,6 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/make_stateful.hpp" -#include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset12.hpp" #include "openvino/opsets/opset13.hpp" From 101102db4779d519defe34487711de0366604011 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Mon, 8 Dec 2025 21:29:45 -0800 Subject: [PATCH 08/12] add post process for internal handled inputs --- onnxruntime/core/providers/openvino/ov_interface.cc | 6 ++++++ onnxruntime/core/providers/openvino/ov_interface.h | 1 + 2 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index b6afda190c444..cb93d54edbbb2 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -525,6 +525,12 @@ void StatefulOVInferRequest::PreProcessInferRequest() { void StatefulOVInferRequest::Infer() { PreProcessInferRequest(); OVInferRequest::Infer(); + PostProcessInferRequest(); +} + +void StatefulOVInferRequest::PostProcessInferRequest() { + kv_src_indices.clear(); + kv_dst_indices.clear(); } void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 4018aedea3094..a352456f4ac41 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -155,6 +155,7 @@ class StatefulOVInferRequest : public OVInferRequest { private: void PreProcessInferRequest(); + void PostProcessInferRequest(); std::string target_device; // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, From 59bd56ebd00112a3722849650a6a597d13749b06 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 11 Dec 2025 20:53:40 -0800 Subject: [PATCH 09/12] disable update_kvcache for npu + pass kv info --- .../openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/ov_interface.cc | 17 ++--- .../openvino/ov_stateful_patch_utils.cc | 62 ++++++++++++------- .../openvino/ov_stateful_patch_utils.h | 5 +- 4 files changed, 54 insertions(+), 32 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index b1ab21d826a2c..00a04aac11ab1 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -287,7 +287,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span& model, LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() 
Status:\t" << (model_status ? "True" : "False"); if (!model_status) { LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); + PatchStatefulDecoder(model, hw_target); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { @@ -468,25 +468,28 @@ void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. FillTensor("beam_idx", ov::element::i32, {1}, 0); - + ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape(); + uint64_t kv_num_heads = dst_idx_shape[1]; + uint64_t kv_head_size = dst_idx_shape[3]; if (kv_src_indices.size() > 0) { ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); for (int i = 0; i < kv_src_indices.size(); ++i) { src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); for (int i = 0; i < kv_dst_indices.size(); ++i) { - for (int j = 0; j < 32; ++j) { - for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); + for (int j = 0; j < kv_num_heads; ++j) { + for (int k = 0; k < kv_head_size; ++k) { + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); } } } ovInfReq.set_tensor("dst_idx", dst_idx_tensor); } else { FillTensor("src_idx", ov::element::i32, {0}, 0); - FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0); + FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0); } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 7f2aa4a5cfa3f..31b6c364f7b89 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -75,11 +75,16 @@ std::string GetInputOutputName(std::shared_ptr<ov::Model> ov_model,
 void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                       std::vector<std::string>& not_kv_inputs,
                       const std::vector<std::string>& key_value_input_names,
-                      int gather_dim) {
+                      int gather_dim,
+                      const std::string& device) {
   if (ModelHasInputOutputNames(ov_model, "beam_idx")) {
     throw std::runtime_error("Model already has fused cache");
   }
 
+  // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding
+  // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
+  bool is_support_speculative_LLM = device.find("GPU") != std::string::npos;
+
   // Define input name candidates in priority order
   const std::vector<std::string> input_name_candidates = {
       "inputs_embeds",  // Default fallback
@@ -99,17 +104,22 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
   ov_model->add_parameters({beam_idx});
   not_kv_inputs.push_back(beam_idx->get_friendly_name());
 
-  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
-  src_idx->set_friendly_name("src_idx");
-  src_idx->output(0).get_tensor().add_names({"src_idx"});
-  ov_model->add_parameters({src_idx});
-  not_kv_inputs.push_back(src_idx->get_friendly_name());
-
-  auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
-  dst_idx->set_friendly_name("dst_idx");
-  dst_idx->output(0).get_tensor().add_names({"dst_idx"});
-  ov_model->add_parameters({dst_idx});
-  not_kv_inputs.push_back(dst_idx->get_friendly_name());
+  std::shared_ptr<ov::opset13::Parameter> src_idx;
+  std::shared_ptr<ov::opset13::Parameter> dst_idx;
+
+  if (is_support_speculative_LLM) {
+    src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+    src_idx->set_friendly_name("src_idx");
+    src_idx->output(0).get_tensor().add_names({"src_idx"});
+    ov_model->add_parameters({src_idx});
+    not_kv_inputs.push_back(src_idx->get_friendly_name());
+
+    dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
+    dst_idx->set_friendly_name("dst_idx");
+    dst_idx->output(0).get_tensor().add_names({"dst_idx"});
+    ov_model->add_parameters({dst_idx});
+    not_kv_inputs.push_back(dst_idx->get_friendly_name());
+  }
 
   // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
   for (const auto& input_name : key_value_input_names) {
@@ -121,17 +131,25 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                                               beam_idx,
                                               ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
 
-    auto updatekv_gather_op =
-        std::make_shared<ov::opset13::Gather>(gather_op,
-                                              src_idx,
-                                              ov::opset13::Constant::create(ov::element::i64, {}, {2}));
-
-    auto updatekv_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
-                                                                            dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+    std::shared_ptr<ov::Node> output_node;
+    if (is_support_speculative_LLM) {
+      auto updatekv_gather_op =
+          std::make_shared<ov::opset13::Gather>(gather_op,
+                                                src_idx,
+                                                ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+
+      auto updatekv_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
+                                                                              dst_idx,
+                                                                              updatekv_gather_op,
+                                                                              ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+      output_node = updatekv_op;
+    } else {
+      output_node = gather_op;
+    }
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {
-      consumer.replace_source_output(updatekv_op->output(0));
+      consumer.replace_source_output(output_node->output(0));
     }
   }
 
@@ -268,7 +286,7 @@ std::pair<std::vector<std::string>, std::vector<std::string>> ExtractInputKVTens
 }
 
 // Updated PatchStatefulDecoder function
-void PatchStatefulDecoder(std::shared_ptr<ov::Model> model) {
+void PatchStatefulDecoder(std::shared_ptr<ov::Model> model, const std::string& device) {
   // Use the dynamic pattern-based extraction logic
   auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model);
   auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns);
@@ -290,7 +308,7 @@ void PatchStatefulDecoder(std::shared_ptr<ov::Model> model) {
   // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0
   auto batch_dim = 0;
 
-  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim);
+  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, device);
 
   MakeStateful(model, key_value_input_names, key_value_output_names);
 }
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 11b0cc1dbe9bb..ce7db01063426 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -26,13 +26,14 @@ bool ModelHasInputOutputNames(std::shared_ptr<ov::Model> model, const std::strin
 void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                       std::vector<std::string>& not_kv_inputs,
                       const std::vector<std::string>& key_value_input_names,
-                      int gather_dim);
+                      int gather_dim,
+                      const std::string& device = "");
 
 void MakeStateful(std::shared_ptr<ov::Model>& ov_model,
                   const std::vector<std::string>& key_value_input_names,
                   const std::vector<std::string>& key_value_output_names);
 
-void PatchStatefulDecoder(std::shared_ptr<ov::Model> model);
+void PatchStatefulDecoder(std::shared_ptr<ov::Model> model, const std::string& device = "");
 
 bool HasOpWithType(const std::shared_ptr<const ov::Model>& function, const std::string& type_name);
 

From 7ea12e74aa860d62cf7d11a78f00c72c3f86e210 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Thu, 11 Dec 2025 22:08:58 -0800
Subject: [PATCH 10/12] refactor code

---
 .../openvino/openvino_execution_provider.cc | 75 +++++++++----------
 .../core/providers/openvino/ov_interface.cc |  8 +-
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index 00a04aac11ab1..cc22fddab10f6 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -291,57 +291,52 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const char* const>
 
-      std::vector<size_t> src_indices;
-      std::vector<size_t> dst_indices;
+      auto parse_indices = [](const std::string& input, const std::string& index_type) -> std::pair<Status, std::vector<size_t>> {
+        std::vector<size_t> indices;
+        std::stringstream stream(input);
+        std::string token;
 
-      try {
-        // Parse source indices from comma-separated string
-        std::stringstream src_stream(src_string);
-        std::string src_token;
-        while (std::getline(src_stream, src_token, ',')) {
-          // Trim whitespace
-          src_token.erase(0, src_token.find_first_not_of(" \t"));
-          src_token.erase(src_token.find_last_not_of(" \t") + 1);
-
-          if (!src_token.empty()) {
-            int64_t index = std::stoll(src_token);
-            if (index >= 0) {
-              src_indices.push_back(static_cast<size_t>(index));
-            } else {
-              LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index;
+        try {
+          while (std::getline(stream, token, ',')) {
+            // Trim whitespace
+            token.erase(0, token.find_first_not_of(" \t"));
+            token.erase(token.find_last_not_of(" \t") + 1);
+
+            if (!token.empty()) {
+              int64_t index = std::stoll(token);
+              if (index >= 0) {
+                indices.push_back(static_cast<size_t>(index));
+              } else {
+                return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                               "kvcache_reorder " + index_type + " cannot be negative: " + std::to_string(index)),
+                        std::vector<size_t>()};
+              }
             }
           }
+        } catch (const std::exception& e) {
+          return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                         "Failed to parse kvcache_reorder " + index_type + ": " + std::string(e.what())),
+                  std::vector<size_t>()};
         }
 
-        // Parse destination indices from comma-separated string
-        std::stringstream dst_stream(dst_string);
-        std::string dst_token;
-        while (std::getline(dst_stream, dst_token, ',')) {
-          // Trim whitespace
-          dst_token.erase(0, dst_token.find_first_not_of(" \t"));
-          dst_token.erase(dst_token.find_last_not_of(" \t") + 1);
-
-          if (!dst_token.empty()) {
-            int64_t index = std::stoll(dst_token);
-            if (index >= 0) {
-              dst_indices.push_back(static_cast<size_t>(index));
-            } else {
-              LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index;
-            }
-          }
-        }
+        return {Status::OK(), std::move(indices)};
+      };
 
-      } catch (const std::exception& e) {
-        LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. "
-                              << "Exception: " << e.what();
-        return Status::OK();
+      auto [src_status, src_indices] = parse_indices(src_string, "src_index");
+      if (!src_status.IsOK()) {
+        return src_status;
+      }
+
+      auto [dst_status, dst_indices] = parse_indices(dst_string, "dst_index");
+      if (!dst_status.IsOK()) {
+        return dst_status;
       }
 
       // Trigger KVCache Reorder for target Backend with vector arguments
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index b9c9fd0738cdb..db8603f0fab47 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -473,15 +473,15 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   uint64_t kv_head_size = dst_idx_shape[3];
   if (kv_src_indices.size() > 0) {
     ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-    for (int i = 0; i < kv_src_indices.size(); ++i) {
+    for (auto i = 0; i < kv_src_indices.size(); ++i) {
       src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
     }
     ovInfReq.set_tensor("src_idx", src_idx_tensor);
 
     ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-    for (int i = 0; i < kv_dst_indices.size(); ++i) {
-      for (int j = 0; j < kv_num_heads; ++j) {
-        for (int k = 0; k < kv_head_size; ++k) {
+    for (auto i = 0; i < kv_dst_indices.size(); ++i) {
+      for (auto j = 0; j < kv_num_heads; ++j) {
+        for (auto k = 0; k < kv_head_size; ++k) {
           dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
         }
       }

From ab95e39f9b84dfc2a1643ff12063717bb76677f1 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Thu, 11 Dec 2025 22:33:46 -0800
Subject: [PATCH 11/12] minor change

---
 .../core/providers/openvino/ov_interface.cc | 48 +++++++++++--------
 .../core/providers/openvino/ov_interface.h  |  2 +
 .../openvino/ov_stateful_patch_utils.cc     |  8 ++--
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index db8603f0fab47..be008e6d5617e 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -380,6 +380,7 @@ void OVInferRequest::Infer() {
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
+  is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
 
   _npu_logits_slice_required = IsNPULogitsSliceRequired();
 
@@ -468,28 +469,31 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
   FillTensor("beam_idx", ov::element::i32, {1}, 0);
-  ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
-  uint64_t kv_num_heads = dst_idx_shape[1];
-  uint64_t kv_head_size = dst_idx_shape[3];
-  if (kv_src_indices.size() > 0) {
-    ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-    for (auto i = 0; i < kv_src_indices.size(); ++i) {
-      src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
-    }
-    ovInfReq.set_tensor("src_idx", src_idx_tensor);
 
-    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-    for (auto i = 0; i < kv_dst_indices.size(); ++i) {
-      for (auto j = 0; j < kv_num_heads; ++j) {
-        for (auto k = 0; k < kv_head_size; ++k) {
-          dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
+  if (is_support_kvcache_reorder){
+    ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
+    uint64_t kv_num_heads = dst_idx_shape[1];
+    uint64_t kv_head_size = dst_idx_shape[3];
+    if (kv_src_indices.size() > 0) {
+      ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
+      for (auto i = 0; i < kv_src_indices.size(); ++i) {
+        src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
+      }
+      ovInfReq.set_tensor("src_idx", src_idx_tensor);
+
+      ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
+      for (auto i = 0; i < kv_dst_indices.size(); ++i) {
+        for (auto j = 0; j < kv_num_heads; ++j) {
+          for (auto k = 0; k < kv_head_size; ++k) {
+            dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
+          }
+        }
       }
+      ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
+    } else {
+      FillTensor("src_idx", ov::element::i32, {0}, 0);
+      FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0);
     }
-  }
-  ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
-  } else {
-    FillTensor("src_idx", ov::element::i32, {0}, 0);
-    FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0);
   }
 
   // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids.
@@ -532,8 +536,10 @@ void StatefulOVInferRequest::Infer() {
 }
 
 void StatefulOVInferRequest::PostProcessInferRequest() {
-  kv_src_indices.clear();
-  kv_dst_indices.clear();
+  if(is_support_kvcache_reorder){
+    kv_src_indices.clear();
+    kv_dst_indices.clear();
+  }
 }
 
 void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index a352456f4ac41..f2de48cfe35fd 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -163,6 +163,8 @@ class StatefulOVInferRequest : public OVInferRequest {
   bool prefill_use_full_chat_history = false;
   std::vector<int64_t> cached_input_ids;
   std::vector<int64_t> cached_position_ids;
+
+  bool is_support_kvcache_reorder = false;
   std::vector<size_t> kv_src_indices;
   std::vector<size_t> kv_dst_indices;
 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 31b6c364f7b89..0b4f828c7cbb6 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -81,9 +81,9 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
     throw std::runtime_error("Model already has fused cache");
   }
 
-  // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding
+  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
   // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
-  bool is_support_speculative_LLM = device.find("GPU") != std::string::npos;
+  bool is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
 
   // Define input name candidates in priority order
   const std::vector<std::string> input_name_candidates = {
@@ -107,7 +107,7 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
   std::shared_ptr<ov::opset13::Parameter> src_idx;
   std::shared_ptr<ov::opset13::Parameter> dst_idx;
 
-  if (is_support_speculative_LLM) {
+  if (is_support_kvcache_reorder) {
     src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
     src_idx->set_friendly_name("src_idx");
     src_idx->output(0).get_tensor().add_names({"src_idx"});
@@ -132,7 +132,7 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     std::shared_ptr<ov::Node> output_node;
-    if (is_support_speculative_LLM) {
+    if (is_support_kvcache_reorder) {
       auto updatekv_gather_op =
           std::make_shared<ov::opset13::Gather>(gather_op,
                                                 src_idx,

From 166586572c93014da65581b09dda2de6bf9efe97 Mon Sep 17 00:00:00 2001
From: czekun
Date: Tue, 16 Dec 2025 21:48:53 -0800
Subject: [PATCH 12/12] refactor with int32 indices, string_view parsing. move
 fuse flag in exenetwork.
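
For illustration, here is a standalone sketch of the strtol-over-string_view parsing style this refactor moves to (not part of the diff below; the helper name and the "1, 0, 2" sample value are hypothetical, and the input is assumed to come from a NUL-terminated buffer, as std::string guarantees):

    #include <cerrno>
    #include <cstdlib>
    #include <iostream>
    #include <string_view>
    #include <vector>

    // Parse a comma-separated list of non-negative integers into `out`.
    // Returns false on a non-numeric token, a negative value, or overflow.
    static bool ParseIndices(std::string_view input, std::vector<int32_t>& out) {
      while (!input.empty()) {
        const auto comma = input.find(',');
        const auto part = input.substr(0, comma);
        errno = 0;
        char* end = nullptr;
        // strtol skips leading whitespace and stops at the first non-digit,
        // so the trailing comma (or the buffer's NUL) terminates the scan.
        const long value = std::strtol(part.data(), &end, 10);
        if (end == part.data() || value < 0 || errno == ERANGE) {
          return false;
        }
        out.push_back(static_cast<int32_t>(value));
        if (comma == std::string_view::npos) break;
        input.remove_prefix(comma + 1);
      }
      return true;
    }

    int main() {
      std::vector<int32_t> indices;
      if (ParseIndices("1, 0, 2", indices)) {
        for (int32_t v : indices) std::cout << v << '\n';  // prints 1, 0, 2
      }
    }

Compared with the previous per-token std::string + std::stoll approach, this avoids token copies and exception-based error handling.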
--- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_manager.h | 2 +- .../openvino/backends/basic_backend.cc | 2 +- .../openvino/backends/basic_backend.h | 2 +- .../core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 76 ++++++++++--------- .../core/providers/openvino/ov_interface.cc | 47 ++++++------ .../core/providers/openvino/ov_interface.h | 24 +++--- .../openvino/ov_stateful_patch_utils.cc | 20 ++--- .../openvino/ov_stateful_patch_utils.h | 4 +- 10 files changed, 90 insertions(+), 91 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d6d098d66242c..5e80ee3738ed8 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -781,7 +781,7 @@ void BackendManager::RewindKVCache(size_t index) { } } -void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { if (concrete_backend_) { concrete_backend_->ReorderKVCache(src_indices, dst_indices); } diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 62cc7d95a4ef9..f8a74b9cbcfa4 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -31,7 +31,7 @@ class BackendManager { void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data); ov::CompiledModel GetOVCompiledModel(); void RewindKVCache(size_t index); - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 9f85d42821230..7f4d1f74cfb7b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -315,7 +315,7 @@ void BasicBackend::RewindKVCache(size_t index) { }); } -void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) { infer_request->ReorderKVCache(src_indices, dst_indices); }); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index d2b57fdfbac84..c7505d59eec0c 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -137,7 +137,7 @@ class BasicBackend : public IBackend { return exe_network_.Get(); } void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 672fdbc218a78..4444f37ac7433 100644 --- 
a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -18,7 +18,7 @@ class IBackend { virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} - virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} + virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index cc22fddab10f6..b7b0894d7bff7 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -295,53 +296,54 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span std::pair> { - std::vector indices; - std::stringstream stream(input); - std::string token; - - try { - while (std::getline(stream, token, ',')) { - // Trim whitespace - token.erase(0, token.find_first_not_of(" \t")); - token.erase(token.find_last_not_of(" \t") + 1); - - if (!token.empty()) { - int64_t index = std::stoll(token); - if (index >= 0) { - indices.push_back(static_cast(index)); - } else { - return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "kvcache_reorder " + index_type + " cannot be negative: " + std::to_string(index)), - std::vector()}; - } - } + std::string_view src_string(value.begin(), value.begin() + delimiter_pos); + std::string_view dst_string(value.begin() + delimiter_pos + 1, value.end()); + + constexpr auto parse_indices = [](std::string_view input, const std::string& index_type) -> std::variant> { + std::vector indices; + while (!input.empty()) { + const auto delimiter_pos = input.find(','); + const auto part = input.substr(0, delimiter_pos); + errno = 0; + char* parse_end = nullptr; + // strtoll/stoll already skips whitespaces + const auto index = std::strtol(part.data(), &parse_end, 10); + if (parse_end == part.data()) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Failed to parse kvcache_reorder " + index_type + ": " + std::string(part)); + } + if (index < 0) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "kvcache_reorder " + index_type + " cannot be negative: " + std::string(part)); + } + if (errno == ERANGE) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "kvcache_reorder " + index_type + " exceed INT32_MAX: " + std::string(part)); + } + indices.push_back(static_cast(index)); + if (delimiter_pos != std::string_view::npos) { + // ignore any trailing chars after the number, can do futher checking if needed + input.remove_prefix(part.size() + 1); + } else { + break; } - } catch (const std::exception& e) { - return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Failed to parse kvcache_reorder " + index_type + ": " + std::string(e.what())), - std::vector()}; } - - return {Status::OK(), std::move(indices)}; + return indices; }; - auto [src_status, src_indices] = parse_indices(src_string, "src_index"); - if (!src_status.IsOK()) { - return src_status; + const auto src_indices = parse_indices(src_string, "src_index"); + if 
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index be008e6d5617e..5255729478dbe 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -109,9 +109,13 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model,
   bool model_status = IsStateful(model);
   LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False");

+  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
+  bool is_fused_kvcache_reorder = false;
   if (!model_status) {
     LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
-    PatchStatefulDecoder(model, hw_target);
+    // TODO: extend to the NPU device once OpenVINO NPU has the related optimization
+    is_fused_kvcache_reorder = hw_target.find("GPU") != std::string::npos;
+    PatchStatefulDecoder(model, is_fused_kvcache_reorder);
   }

   if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
@@ -152,7 +156,7 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model,
   LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow";
   compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
-  OVExeNetwork exe(compiled_model, hw_target, true);
+  OVExeNetwork exe(compiled_model, hw_target, true, is_fused_kvcache_reorder);

   return exe;
 }
@@ -332,7 +336,7 @@ std::shared_ptr OVExeNetwork::CreateInferRequest() {
   auto infReq = compiled_model_obj.create_infer_request();
   std::shared_ptr ovInfReq;
   if (is_stateful_causallm) {
-    ovInfReq = std::make_shared(std::move(infReq), target_device);
+    ovInfReq = std::make_shared(std::move(infReq), target_device, is_fused_kvcache_reorder);
   } else {
     ovInfReq = std::make_shared(std::move(infReq));
   }
@@ -377,10 +381,9 @@ void OVInferRequest::Infer() {
                 "In Error Couldn't start Inference");
 }

-StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
-    : OVInferRequest(std::move(infer_request)), target_device(device) {
+StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device, bool fused_kvcache_reorder)
+    : OVInferRequest(std::move(infer_request)), target_device(device), is_fused_kvcache_reorder(fused_kvcache_reorder) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-  is_support_kvcache_reorder = device.find("GPU") != std::string::npos;

   _npu_logits_slice_required = IsNPULogitsSliceRequired();
@@ -470,23 +473,23 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
   FillTensor("beam_idx", ov::element::i32, {1}, 0);

-  if (is_support_kvcache_reorder){
+  if (is_fused_kvcache_reorder) {
     ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
-    uint64_t kv_num_heads = dst_idx_shape[1];
-    uint64_t kv_head_size = dst_idx_shape[3];
+    const auto kv_num_heads = dst_idx_shape[1];
+    const auto kv_head_size = dst_idx_shape[3];

     if (kv_src_indices.size() > 0) {
       ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-      for (auto i = 0; i < kv_src_indices.size(); ++i) {
-        src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]);
+      const auto src_idx_ptr = src_idx_tensor.data();
+      for (size_t i = 0; i < kv_src_indices.size(); ++i) {
+        src_idx_ptr[i] = static_cast(kv_src_indices[i]);
       }
       ovInfReq.set_tensor("src_idx", src_idx_tensor);

       ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-      for (auto i = 0; i < kv_dst_indices.size(); ++i) {
-        for (auto j = 0; j < kv_num_heads; ++j) {
-          for (auto k = 0; k < kv_head_size; ++k) {
-            dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
-          }
+      const auto dst_idx_ptr = dst_idx_tensor.data();
+      for (size_t i = 0; i < kv_num_heads; ++i) {
+        for (size_t j = 0; j < kv_dst_indices.size(); ++j) {
+          std::fill_n(dst_idx_ptr + (i * kv_dst_indices.size() + j) * kv_head_size, kv_head_size, kv_dst_indices[j]);
         }
       }
       ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
@@ -536,13 +539,13 @@ void StatefulOVInferRequest::Infer() {
 }

 void StatefulOVInferRequest::PostProcessInferRequest() {
-  if(is_support_kvcache_reorder){
+  if (is_fused_kvcache_reorder) {
     kv_src_indices.clear();
     kv_dst_indices.clear();
   }
 }

-void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {
+void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {
   // Validate input parameters
   if (src_indices.size() != dst_indices.size()) {
     ORT_THROW(log_tag + "ReorderKVCache: src_indices and dst_indices must have the same size. "
@@ -553,12 +556,8 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic
   LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
                      << src_indices.size() << " index pairs";

-  kv_src_indices.clear();
-  kv_dst_indices.clear();
-  for (int i = 0; i < src_indices.size(); ++i) {
-    kv_src_indices.emplace_back(src_indices[i]);
-    kv_dst_indices.emplace_back(dst_indices[i]);
-  }
+  kv_src_indices = src_indices;
+  kv_dst_indices = dst_indices;
 }

 void StatefulOVInferRequest::RewindKVCache(size_t index) {
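The dst_idx tensor must match the update tensor's shape, {1, num_heads, n_indices, head_size}, with each destination index broadcast across its whole head slice; that layout is what the std::fill_n rewrite computes while dropping the innermost loop. A standalone sketch of the same layout math follows, using plain buffers instead of ov::Tensor; the sizes are made up for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Illustrative sizes only; in the patch these come from the dst_idx input shape.
  const size_t num_heads = 2, head_size = 4;
  const std::vector<int32_t> dst_indices = {5, 7, 9};  // n_indices = 3

  // Flat buffer laid out as [1, num_heads, n_indices, head_size], row-major.
  std::vector<int32_t> dst_idx(num_heads * dst_indices.size() * head_size);

  for (size_t i = 0; i < num_heads; ++i) {
    for (size_t j = 0; j < dst_indices.size(); ++j) {
      // Element [0][i][j][k] == dst_indices[j] for every k in the head slice.
      std::fill_n(dst_idx.data() + (i * dst_indices.size() + j) * head_size,
                  head_size, dst_indices[j]);
    }
  }

  // First head prints: 5 5 5 5 7 7 7 7 9 9 9 9
  for (size_t n = 0; n < dst_indices.size() * head_size; ++n) std::cout << dst_idx[n] << ' ';
  std::cout << '\n';
}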
" @@ -553,12 +556,8 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; - kv_src_indices.clear(); - kv_dst_indices.clear(); - for (int i = 0; i < src_indices.size(); ++i) { - kv_src_indices.emplace_back(src_indices[i]); - kv_dst_indices.emplace_back(dst_indices[i]); - } + kv_src_indices = src_indices; + kv_dst_indices = dst_indices; } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index f2de48cfe35fd..2b61a7d603be6 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -91,10 +91,11 @@ class OVExeNetwork { ov::CompiledModel compiled_model_obj; std::string target_device; bool is_stateful_causallm; + bool is_fused_kvcache_reorder = false; public: - explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false) - : compiled_model_obj(std::move(compiled_model)), target_device(std::move(device)), is_stateful_causallm(stateful_causallm) {} + explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false, bool fused_kvcache_reorder = false) + : compiled_model_obj(std::move(compiled_model)), target_device(std::move(device)), is_stateful_causallm(stateful_causallm), is_fused_kvcache_reorder(fused_kvcache_reorder) {} OVExeNetwork() : compiled_model_obj(ov::CompiledModel()), is_stateful_causallm(false) {} ov::CompiledModel& Get() { return compiled_model_obj; } std::shared_ptr CreateInferRequest(); @@ -136,16 +137,16 @@ class OVInferRequest { return ovInfReq; } virtual void RewindKVCache([[maybe_unused]] size_t index) {} - virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} + virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} }; class StatefulOVInferRequest : public OVInferRequest { public: - explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device); + explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device, bool fused_kvcache_reorder = false); void Infer() override; void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; void FillTensor(const std::string& tensor_name, const ov::element::Type& type, const std::vector& shape, int32_t fill_value); void CacheTensor(const std::string& tensor_name, std::vector& cache); @@ -158,15 +159,16 @@ class StatefulOVInferRequest : public OVInferRequest { void PostProcessInferRequest(); std::string target_device; - // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, - // and ensure that full chat history is passed for each prefill call. 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 0b4f828c7cbb6..770c371c399b8 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -76,15 +76,11 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                       std::vector& not_kv_inputs,
                       const std::vector& key_value_input_names,
                       int gather_dim,
-                      const std::string& device) {
+                      const bool is_fused_kvcache_reorder) {
   if (ModelHasInputOutputNames(ov_model, "beam_idx")) {
     throw std::runtime_error("Model already has fused cache");
   }

-  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
-  // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
-  bool is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
-
   // Define input name candidates in priority order
   const std::vector input_name_candidates = {
       "inputs_embeds",  // Default fallback
@@ -107,7 +103,7 @@ void FuseCacheReorder(std::shared_ptr ov_model,
   std::shared_ptr src_idx;
   std::shared_ptr dst_idx;
-  if (is_support_kvcache_reorder) {
+  if (is_fused_kvcache_reorder) {
     src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]}));
     src_idx->set_friendly_name("src_idx");
     src_idx->output(0).get_tensor().add_names({"src_idx"});
@@ -132,16 +128,16 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                                                 ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));

   std::shared_ptr output_node;
-  if (is_support_kvcache_reorder) {
+  if (is_fused_kvcache_reorder) {
     auto updatekv_gather_op = std::make_shared(gather_op,
                                                src_idx,
                                                ov::opset13::Constant::create(ov::element::i64, {}, {2}));
     auto updatekv_op = std::make_shared(gather_op,
-                                        dst_idx,
-                                        updatekv_gather_op,
-                                        ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+                                        dst_idx,
+                                        updatekv_gather_op,
+                                        ov::opset13::Constant::create(ov::element::i64, {}, {2}));
     output_node = updatekv_op;
   } else {
     output_node = gather_op;
@@ -286,7 +282,7 @@ std::pair, std::vector> ExtractInputKVTens
 }

 // Updated PatchStatefulDecoder function
-void PatchStatefulDecoder(std::shared_ptr model, const std::string& device) {
+void PatchStatefulDecoder(std::shared_ptr model, const bool is_fused_kvcache_reorder) {
   // Use the dynamic pattern-based extraction logic
   auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model);
   auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns);
@@ -308,7 +304,7 @@ void PatchStatefulDecoder(std::shared_ptr model, const std::string& d
   // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0
   auto batch_dim = 0;

-  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, device);
+  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, is_fused_kvcache_reorder);

   MakeStateful(model, key_value_input_names, key_value_output_names);
 }
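Per KV input, the fused subgraph gathers rows along the sequence axis (axis 2) at src_idx and scatters them back into the same tensor at dst_idx, so the reorder happens inside the compiled graph rather than through state read-backs. A minimal standalone sketch of the same construction follows, using a toy 4-D parameter in place of the model's KV cache read and the same opset13 ops as the patch.

#include <memory>
#include <openvino/openvino.hpp>
#include <openvino/opsets/opset13.hpp>

// Builds data -> Gather(axis=2, src_idx) -> ScatterElementsUpdate(axis=2, dst_idx)
// over a toy [1, heads, seq, head_size] input, mirroring the fused reorder subgraph.
std::shared_ptr<ov::Model> BuildReorderDemo() {
  auto data = std::make_shared<ov::opset13::Parameter>(ov::element::f32,
                                                       ov::PartialShape{1, 2, 8, 4});
  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32,
                                                          ov::PartialShape{3});
  // ScatterElementsUpdate requires indices shaped like the updates tensor, which is
  // why PreProcessInferRequest broadcasts dst_idx to [1, heads, n_indices, head_size].
  auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32,
                                                          ov::PartialShape{1, 2, 3, 4});

  auto axis2 = ov::opset13::Constant::create(ov::element::i64, {}, {2});
  auto gathered = std::make_shared<ov::opset13::Gather>(data, src_idx, axis2);
  auto scattered = std::make_shared<ov::opset13::ScatterElementsUpdate>(data, dst_idx, gathered, axis2);

  return std::make_shared<ov::Model>(ov::OutputVector{scattered},
                                     ov::ParameterVector{data, src_idx, dst_idx});
}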
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index ce7db01063426..bfb6224fc8993 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -27,13 +27,13 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                       std::vector& not_kv_inputs,
                       const std::vector& key_value_input_names,
                       int gather_dim,
-                      const std::string& device = "");
+                      const bool is_fused_kvcache_reorder = false);

 void MakeStateful(std::shared_ptr& ov_model,
                   const std::vector& key_value_input_names,
                   const std::vector& key_value_output_names);

-void PatchStatefulDecoder(std::shared_ptr model, const std::string& device = "");
+void PatchStatefulDecoder(std::shared_ptr model, const bool is_fused_kvcache_reorder = false);

 bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name);
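A hedged end-to-end sketch of how these helpers would be driven, simplified from StatefulCompileModel: it assumes a stateless decoder model on disk (the "decoder.xml" path is hypothetical) and reuses the GPU-only gating from the patch; the exact PatchStatefulDecoder parameter types are elided in this diff.

#include <openvino/openvino.hpp>
// Declaration from ov_stateful_patch_utils.h, with the elided model type assumed
// to be std::shared_ptr<ov::Model>:
// void PatchStatefulDecoder(std::shared_ptr<ov::Model> model,
//                           const bool is_fused_kvcache_reorder = false);

int main() {
  ov::Core core;
  auto model = core.read_model("decoder.xml");  // hypothetical stateless decoder model

  const std::string hw_target = "GPU";
  // GPU-only for now; the patch leaves NPU for a future OpenVINO optimization.
  const bool fuse_reorder = hw_target.find("GPU") != std::string::npos;

  // Adds beam_idx (plus src_idx/dst_idx when fused) inputs and makes KV I/O stateful.
  PatchStatefulDecoder(model, fuse_reorder);
  auto compiled = core.compile_model(model, hw_target);
  auto request = compiled.create_infer_request();
  (void)request;  // src_idx/dst_idx are then populated per step in PreProcessInferRequest
}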