From e67a96b460674133f441dbcc78284a1fa79b3396 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Fri, 31 Oct 2025 03:21:39 -0700
Subject: [PATCH 01/12] Reorder KV cache using the new gather_by_axis API

---
 .../core/providers/openvino/ov_interface.cc | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index a57db77c37dfa..9350d348450cf 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -507,6 +507,29 @@ void StatefulOVInferRequest::Infer() {
   OVInferRequest::Infer();
 }
 
+void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  // Validate input parameters
+  if (src_indices.size() != dst_indices.size()) {
+    ORT_THROW(log_tag + "ReorderKVCache: src_indices and dst_indices must have the same size. "
+              "Got src_indices.size()=" + std::to_string(src_indices.size()) +
+              ", dst_indices.size()=" + std::to_string(dst_indices.size()));
+  }
+
+  LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
+                     << src_indices.size() << " index pairs";
+
+  // Retrieve KVCache states and reorder them based on the provided indices
+  auto states = ovInfReq.query_state();
+
+  for (auto& state : states) {
+    auto start_time = std::chrono::high_resolution_clock::now();
+    state.gather_by_axis(src_indices, dst_indices);
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds";
+  }
+}
+
 void StatefulOVInferRequest::RewindKVCache(size_t index) {
   LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index="
                      << index;

From 349eff9a7e20388da854fec73bc09e9627fa5833 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 13 Nov 2025 06:07:10 -0800
Subject: [PATCH 02/12] Do a ScatterElementsUpdate-based reorder during
 execution

---
 .../core/providers/openvino/ov_interface.cc | 34 ++++++++++++++++++-
 .../core/providers/openvino/ov_interface.h  |  2 ++
 .../openvino/ov_stateful_patch_utils.cc     | 17 ++++++++--
 .../openvino/ov_stateful_patch_utils.h      |  1 +
 4 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 9350d348450cf..0090783d1e088 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,7 +467,25 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
- FillTensor("beam_idx", ov::element::i32, {1}, 0); + if (beam_idx_val.size() == 3) { + ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3}); + for (int i = 0; i < 3; ++i) { + beam_idx_tensor.data()[i] = int32_t(beam_idx_val[i]); + } + ovInfReq.set_tensor("beam_idx", beam_idx_tensor); + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96}); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 32; ++j) { + for (int k = 0; k < 96; ++k) { + dst_idx_tensor.data()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]); + } + } + } + ovInfReq.set_tensor("dst_idx", dst_idx_tensor); + } else { + FillTensor("beam_idx", ov::element::i32, {3}, 0); + FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0); + } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. if (prefill_use_full_chat_history) { @@ -518,6 +536,19 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; + // set beam_idx and dst_idx based on provided values + if (beam_idx_val.size() == 0) { + for (int i = 0; i < 3; ++i) { + beam_idx_val.emplace_back(src_indices[i]); + dst_idx_val.emplace_back(dst_indices[i]); + } + } else { + for (int i = 0; i < 3; ++i) { + beam_idx_val[i] = src_indices[i]; + dst_idx_val[i] = dst_indices[i]; + } + } + /* // Retrieve KVCache states and reorder them based on the provided indices auto states = ovInfReq.query_state(); @@ -528,6 +559,7 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic auto duration = std::chrono::duration_cast(end_time - start_time).count(); LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; } + */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index aa4b3fbe64898..8073157e4450e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -160,6 +160,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; + std::vector beam_idx_val; + std::vector dst_idx_val; bool IsNPULogitsSliceRequired(); bool _npu_logits_slice_required = false; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index fd2b5797a1f40..a3555201f3939 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -91,13 +91,21 @@ void FuseCacheReorder(std::shared_ptr ov_model, std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates); auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; + auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape(); + update_shape[2] = 3; - auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({std::move(input_batch)})); + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({3})); beam_idx->set_friendly_name("beam_idx"); beam_idx->output(0).get_tensor().add_names({"beam_idx"}); ov_model->add_parameters({beam_idx}); not_kv_inputs.push_back(beam_idx->get_friendly_name()); + auto dst_idx = std::make_shared(ov::element::i32, update_shape); 
+  dst_idx->set_friendly_name("dst_idx");
+  dst_idx->output(0).get_tensor().add_names({"dst_idx"});
+  ov_model->add_parameters({dst_idx});
+  not_kv_inputs.push_back(dst_idx->get_friendly_name());
+
   // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
   for (const auto& input_name : key_value_input_names) {
     auto parameter_output_port = ov_model->input(input_name);
@@ -106,11 +114,14 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     auto gather_op = std::make_shared<ov::opset13::Gather>(parameter_output_port,
                                                            beam_idx,
-                                                           ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
+                                                           ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+
+    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(parameter_output_port,
+                                                                          dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {
-      consumer.replace_source_output(gather_op->output(0));
+      consumer.replace_source_output(update_op->output(0));
     }
   }
 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 0b89c4ed02e13..11b0cc1dbe9bb 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -13,6 +13,7 @@
 
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/make_stateful.hpp"
+#include "openvino/opsets/opset12.hpp"
 #include "openvino/opsets/opset13.hpp"
 
 namespace onnxruntime {

From 49ed42a61da9dd889dafb6a70cc6d08eab109f06 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 13 Nov 2025 09:47:05 -0800
Subject: [PATCH 03/12] Get variable update lengths from incoming indices

---
 .../core/providers/openvino/ov_interface.cc | 31 ++++++++-----------
 .../openvino/ov_stateful_patch_utils.cc     |  4 +--
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 0090783d1e088..2df8f8b412a91 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,24 +467,24 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
-  if (beam_idx_val.size() == 3) {
-    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {3});
-    for (int i = 0; i < 3; ++i) {
+  if (beam_idx_val.size() > 0) {
+    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()});
+    for (int i = 0; i < beam_idx_val.size(); ++i) {
       beam_idx_tensor.data<int32_t>()[i] = int32_t(beam_idx_val[i]);
     }
     ovInfReq.set_tensor("beam_idx", beam_idx_tensor);
-    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, 3, 96});
-    for (int i = 0; i < 3; ++i) {
+    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96});
+    for (int i = 0; i < dst_idx_val.size(); ++i) {
       for (int j = 0; j < 32; ++j) {
         for (int k = 0; k < 96; ++k) {
-          dst_idx_tensor.data<int32_t>()[(j * 3 + i) * 96 + k] = int32_t(dst_idx_val[i]);
+          dst_idx_tensor.data<int32_t>()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]);
         }
       }
     }
     ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
   } else {
-    FillTensor("beam_idx", ov::element::i32, {3}, 0);
-    FillTensor("dst_idx", ov::element::i32, {1, 32, 3, 96}, 0);
+    FillTensor("beam_idx", ov::element::i32, {0}, 0);
+    FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0);
   }
 
   // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids.
@@ -537,16 +537,11 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indic
                      << src_indices.size() << " index pairs";
 
   // set beam_idx and dst_idx based on provided values
-  if (beam_idx_val.size() == 0) {
-    for (int i = 0; i < 3; ++i) {
-      beam_idx_val.emplace_back(src_indices[i]);
-      dst_idx_val.emplace_back(dst_indices[i]);
-    }
-  } else {
-    for (int i = 0; i < 3; ++i) {
-      beam_idx_val[i] = src_indices[i];
-      dst_idx_val[i] = dst_indices[i];
-    }
+  beam_idx_val.clear();
+  dst_idx_val.clear();
+  for (int i = 0; i < src_indices.size(); ++i) {
+    beam_idx_val.emplace_back(src_indices[i]);
+    dst_idx_val.emplace_back(dst_indices[i]);
   }
   /*
   // Retrieve KVCache states and reorder them based on the provided indices
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index a3555201f3939..73231024a67a2 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -90,11 +90,9 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
   std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates);
 
-  auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0];
   auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape();
-  update_shape[2] = 3;
 
-  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({3}));
+  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
   beam_idx->set_friendly_name("beam_idx");
   beam_idx->output(0).get_tensor().add_names({"beam_idx"});
   ov_model->add_parameters({beam_idx});

From 8dea771c48b4f52b1a37c297988a5a374398aac4 Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Thu, 20 Nov 2025 09:14:57 -0800
Subject: [PATCH 04/12] Make changes to support new KVCache fusion

---
 .../core/providers/openvino/ov_interface.cc | 18 ++++++++++--------
 .../core/providers/openvino/ov_interface.h  |  2 +-
 .../openvino/ov_stateful_patch_utils.cc     | 18 +++++++++++++++---
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 2df8f8b412a91..1bd5f74846a56 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -467,12 +467,14 @@ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string&
 void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
-  if (beam_idx_val.size() > 0) {
-    ov::Tensor beam_idx_tensor = ov::Tensor(ov::element::i32, {beam_idx_val.size()});
-    for (int i = 0; i < beam_idx_val.size(); ++i) {
-      beam_idx_tensor.data<int32_t>()[i] = int32_t(beam_idx_val[i]);
+  FillTensor("beam_idx", ov::element::i32, {1}, 0);
+
+  if (src_idx_val.size() > 0) {
+    ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()});
+    for (int i = 0; i < src_idx_val.size(); ++i) {
+      src_idx_tensor.data<int32_t>()[i] = int32_t(src_idx_val[i]);
     }
-    ovInfReq.set_tensor("beam_idx", beam_idx_tensor);
+    ovInfReq.set_tensor("src_idx", src_idx_tensor);
     ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96});
     for (int i = 0; i < dst_idx_val.size(); ++i) {
       for (int j = 0; j < 32; ++j) {
@@ -483,7 +485,7 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
     }
     ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
   } else {
-    FillTensor("beam_idx", ov::element::i32, {0}, 0);
+    FillTensor("src_idx", ov::element::i32, {0}, 0);
     FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0);
   }
 
@@ -537,10 +539,10 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indic
                      << src_indices.size() << " index pairs";
 
   // set beam_idx and dst_idx based on provided values
-  beam_idx_val.clear();
+  src_idx_val.clear();
   dst_idx_val.clear();
   for (int i = 0; i < src_indices.size(); ++i) {
-    beam_idx_val.emplace_back(src_indices[i]);
+    src_idx_val.emplace_back(src_indices[i]);
     dst_idx_val.emplace_back(dst_indices[i]);
   }
   /*
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8073157e4450e..8f3d1cd38a2b6 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -160,7 +160,7 @@ class StatefulOVInferRequest : public OVInferRequest {
   bool prefill_use_full_chat_history = false;
   std::vector<int64_t> cached_input_ids;
   std::vector<int64_t> cached_position_ids;
-  std::vector<size_t> beam_idx_val;
+  std::vector<size_t> src_idx_val;
   std::vector<size_t> dst_idx_val;
 
   bool IsNPULogitsSliceRequired();
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 73231024a67a2..f9b055028bbb5 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -90,14 +90,21 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
   std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates);
 
+  auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0];
   auto update_shape = ov_model->input(key_value_input_names[0]).get_partial_shape();
 
-  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+  auto beam_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({std::move(input_batch)}));
   beam_idx->set_friendly_name("beam_idx");
   beam_idx->output(0).get_tensor().add_names({"beam_idx"});
   ov_model->add_parameters({beam_idx});
   not_kv_inputs.push_back(beam_idx->get_friendly_name());
 
+  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+  src_idx->set_friendly_name("src_idx");
+  src_idx->output(0).get_tensor().add_names({"src_idx"});
+  ov_model->add_parameters({src_idx});
+  not_kv_inputs.push_back(src_idx->get_friendly_name());
+
   auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
   dst_idx->set_friendly_name("dst_idx");
   dst_idx->output(0).get_tensor().add_names({"dst_idx"});
@@ -112,10 +119,15 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     auto gather_op = std::make_shared<ov::opset13::Gather>(parameter_output_port,
                                                            beam_idx,
+                                                           ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
+
+    auto update_gather_op =
+        std::make_shared<ov::opset13::Gather>(gather_op,
+                                              src_idx,
                                               ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
-    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(parameter_output_port,
-                                                                          dst_idx, gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+    auto update_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
+                                                                          dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {

From 6438df8bef1f1599c240323fb06e61fa3c719fed Mon Sep 17 00:00:00 2001
From: "Dvoretckii, Mikhail"
Date: Tue, 2 Dec 2025 03:44:43 -0800
Subject: [PATCH 05/12] Add proper include

---
 onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 11b0cc1dbe9bb..76a3065910ee7 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -13,6 +13,7 @@
 
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/make_stateful.hpp"
+#include "openvino/opsets/opset3.hpp"
 #include "openvino/opsets/opset12.hpp"
 #include "openvino/opsets/opset13.hpp"
 

From c76fb92587584777a45d7ba6c8cf14a944a0c372 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Wed, 15 Oct 2025 18:39:00 -0700
Subject: [PATCH 06/12] add reorder KV cache API

---
 .../providers/openvino/backend_manager.cc   |  6 ++
 .../core/providers/openvino/backend_manager.h |  1 +
 .../openvino/backends/basic_backend.cc      |  6 ++
 .../openvino/backends/basic_backend.h       |  1 +
 .../core/providers/openvino/ibackend.h      |  1 +
 .../openvino/openvino_execution_provider.cc | 62 +++++++++++++++++++
 .../core/providers/openvino/ov_interface.h  |  2 +
 7 files changed, 79 insertions(+)

diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index fa23f6969b633..d6d098d66242c 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -781,5 +781,11 @@ void BackendManager::RewindKVCache(size_t index) {
   }
 }
 
+void BackendManager::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  if (concrete_backend_) {
+    concrete_backend_->ReorderKVCache(src_indices, dst_indices);
+  }
+}
+
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h
index 9f560340a2033..62cc7d95a4ef9 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.h
+++ b/onnxruntime/core/providers/openvino/backend_manager.h
@@ -31,6 +31,7 @@ class BackendManager {
   void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data);
   ov::CompiledModel GetOVCompiledModel();
   void RewindKVCache(size_t index);
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices);
 
  private:
  std::unique_ptr<ONNX_NAMESPACE::ModelProto> GetModelProtoFromFusedNode(
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 7c3ee7e76c3f9..9f85d42821230 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -315,6 +315,12 @@ void BasicBackend::RewindKVCache(size_t index) {
   });
 }
 
+void BasicBackend::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
+  infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) {
+    infer_request->ReorderKVCache(src_indices, dst_indices);
+  });
+}
+
 void BasicBackend::Infer(OrtKernelContext* ctx) const {
   Ort::KernelContext context(ctx);
 
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 7639e024c52cb..d2b57fdfbac84 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -137,6 +137,7 @@ class BasicBackend : public IBackend {
     return exe_network_.Get();
   }
   void RewindKVCache(size_t index) override;
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) override;
 
  private:
  bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h
index 365a4625815d6..672fdbc218a78 100644
--- a/onnxruntime/core/providers/openvino/ibackend.h
+++ b/onnxruntime/core/providers/openvino/ibackend.h
@@ -18,6 +18,7 @@ class IBackend {
   virtual ov::CompiledModel GetOVCompiledModel() = 0;
   virtual ~IBackend() = default;
   virtual void RewindKVCache(size_t index) {}
+  virtual void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {}
 };
 using ptr_stream_t = std::unique_ptr<std::istream>;
 class BackendFactory {
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index a099f85b2a4b9..b1ab21d826a2c 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -286,6 +286,68 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const char* const>
+      std::vector<size_t> src_indices;
+      std::vector<size_t> dst_indices;
+
+      try {
+        // Parse source indices from comma-separated string
+        std::stringstream src_stream(src_string);
+        std::string src_token;
+        while (std::getline(src_stream, src_token, ',')) {
+          // Trim whitespace
+          src_token.erase(0, src_token.find_first_not_of(" \t"));
+          src_token.erase(src_token.find_last_not_of(" \t") + 1);
+
+          if (!src_token.empty()) {
+            int64_t index = std::stoll(src_token);
+            if (index >= 0) {
+              src_indices.push_back(static_cast<size_t>(index));
+            } else {
+              LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index;
+            }
+          }
+        }
+
+        // Parse destination indices from comma-separated string
+        std::stringstream dst_stream(dst_string);
+        std::string dst_token;
+        while (std::getline(dst_stream, dst_token, ',')) {
+          // Trim whitespace
+          dst_token.erase(0, dst_token.find_first_not_of(" \t"));
+          dst_token.erase(dst_token.find_last_not_of(" \t") + 1);
+
+          if (!dst_token.empty()) {
+            int64_t index = std::stoll(dst_token);
+            if (index >= 0) {
+              dst_indices.push_back(static_cast<size_t>(index));
+            } else {
+              LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index;
+            }
+          }
+        }
+
+      } catch (const std::exception& e) {
+        LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. "
+                              << "Exception: " << e.what();
+        return Status::OK();
+      }
+
+      // Trigger KVCache Reorder for target Backend with vector arguments
+      for (auto& backend : backend_managers_) {
+        backend.ReorderKVCache(src_indices, dst_indices);
+      }
     } else {
       // Handle unknown options
       LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value;
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8f3d1cd38a2b6..6a001533ff26c 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -136,6 +136,7 @@ class OVInferRequest {
     return ovInfReq;
   }
   virtual void RewindKVCache([[maybe_unused]] size_t index) {}
+  virtual void ReorderKVCache([[maybe_unused]] const std::vector<size_t>& src_indices, [[maybe_unused]] const std::vector<size_t>& dst_indices) {}
 };
 
 class StatefulOVInferRequest : public OVInferRequest {
@@ -144,6 +145,7 @@ class StatefulOVInferRequest : public OVInferRequest {
 
   void Infer() override;
   void RewindKVCache(size_t index) override;
+  void ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) override;
   void FillTensor(const std::string& tensor_name, const ov::element::Type& type,
                   const std::vector<size_t>& shape, int32_t fill_value);
   void CacheTensor(const std::string& tensor_name, std::vector<int64_t>& cache);

From 13a9f01c2c9798ef85293070935d58e0eb528ff3 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Fri, 5 Dec 2025 15:02:51 -0800
Subject: [PATCH 07/12] clean up code

---
 .../core/providers/openvino/ov_interface.cc | 35 ++++++-------------
 .../core/providers/openvino/ov_interface.h  |  4 +--
 .../openvino/ov_stateful_patch_utils.cc     |  8 ++---
 .../openvino/ov_stateful_patch_utils.h      |  1 -
 4 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 1bd5f74846a56..b6afda190c444 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -469,17 +469,17 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
FillTensor("beam_idx", ov::element::i32, {1}, 0); - if (src_idx_val.size() > 0) { - ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {src_idx_val.size()}); - for (int i = 0; i < src_idx_val.size(); ++i) { - src_idx_tensor.data()[i] = int32_t(src_idx_val[i]); + if (kv_src_indices.size() > 0) { + ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); + for (int i = 0; i < kv_src_indices.size(); ++i) { + src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, dst_idx_val.size(), 96}); - for (int i = 0; i < dst_idx_val.size(); ++i) { + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + for (int i = 0; i < kv_dst_indices.size(); ++i) { for (int j = 0; j < 32; ++j) { for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * dst_idx_val.size() + i) * 96 + k] = int32_t(dst_idx_val[i]); + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); } } } @@ -538,25 +538,12 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; - // set beam_idx and dst_idx based on provided values - src_idx_val.clear(); - dst_idx_val.clear(); + kv_src_indices.clear(); + kv_dst_indices.clear(); for (int i = 0; i < src_indices.size(); ++i) { - src_idx_val.emplace_back(src_indices[i]); - dst_idx_val.emplace_back(dst_indices[i]); + kv_src_indices.emplace_back(src_indices[i]); + kv_dst_indices.emplace_back(dst_indices[i]); } - /* - // Retrieve KVCache states and reorder them based on the provided indices - auto states = ovInfReq.query_state(); - - for (auto& state : states) { - auto start_time = std::chrono::high_resolution_clock::now(); - state.gather_by_axis(src_indices, dst_indices); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - LOGS_DEFAULT(INFO) << log_tag << "gather_by_axis: " << duration << " microseconds"; - } - */ } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 6a001533ff26c..4018aedea3094 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -162,8 +162,8 @@ class StatefulOVInferRequest : public OVInferRequest { bool prefill_use_full_chat_history = false; std::vector cached_input_ids; std::vector cached_position_ids; - std::vector src_idx_val; - std::vector dst_idx_val; + std::vector kv_src_indices; + std::vector kv_dst_indices; bool IsNPULogitsSliceRequired(); bool _npu_logits_slice_required = false; diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc index f9b055028bbb5..7f2aa4a5cfa3f 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -121,17 +121,17 @@ void FuseCacheReorder(std::shared_ptr ov_model, beam_idx, ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); - auto update_gather_op = + auto updatekv_gather_op = std::make_shared(gather_op, src_idx, ov::opset13::Constant::create(ov::element::i64, {}, {2})); - auto update_op = 
std::make_shared(gather_op, - dst_idx, update_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); + auto updatekv_op = std::make_shared(gather_op, + dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2})); // Replace the source output for all consumers of the input tensor for (auto& consumer : consumers) { - consumer.replace_source_output(update_op->output(0)); + consumer.replace_source_output(updatekv_op->output(0)); } } diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h index 76a3065910ee7..11b0cc1dbe9bb 100644 --- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -13,7 +13,6 @@ #include "openvino/pass/manager.hpp" #include "openvino/pass/make_stateful.hpp" -#include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset12.hpp" #include "openvino/opsets/opset13.hpp" From 101102db4779d519defe34487711de0366604011 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Mon, 8 Dec 2025 21:29:45 -0800 Subject: [PATCH 08/12] add post process for internal handled inputs --- onnxruntime/core/providers/openvino/ov_interface.cc | 6 ++++++ onnxruntime/core/providers/openvino/ov_interface.h | 1 + 2 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index b6afda190c444..cb93d54edbbb2 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -525,6 +525,12 @@ void StatefulOVInferRequest::PreProcessInferRequest() { void StatefulOVInferRequest::Infer() { PreProcessInferRequest(); OVInferRequest::Infer(); + PostProcessInferRequest(); +} + +void StatefulOVInferRequest::PostProcessInferRequest() { + kv_src_indices.clear(); + kv_dst_indices.clear(); } void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 4018aedea3094..a352456f4ac41 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -155,6 +155,7 @@ class StatefulOVInferRequest : public OVInferRequest { private: void PreProcessInferRequest(); + void PostProcessInferRequest(); std::string target_device; // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, From 59bd56ebd00112a3722849650a6a597d13749b06 Mon Sep 17 00:00:00 2001 From: Kotomi-Du Date: Thu, 11 Dec 2025 20:53:40 -0800 Subject: [PATCH 09/12] disable update_kvcache for npu + pass kv info --- .../openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/ov_interface.cc | 17 ++--- .../openvino/ov_stateful_patch_utils.cc | 62 ++++++++++++------- .../openvino/ov_stateful_patch_utils.h | 5 +- 4 files changed, 54 insertions(+), 32 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index b1ab21d826a2c..00a04aac11ab1 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -287,7 +287,7 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span& model, LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() 
Status:\t" << (model_status ? "True" : "False"); if (!model_status) { LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); + PatchStatefulDecoder(model, hw_target); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { @@ -468,25 +468,28 @@ void StatefulOVInferRequest::PreProcessInferRequest() { // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. // TODO(ankit): Address this issue and implement the fix at the appropriate layer. FillTensor("beam_idx", ov::element::i32, {1}, 0); - + ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape(); + uint64_t kv_num_heads = dst_idx_shape[1]; + uint64_t kv_head_size = dst_idx_shape[3]; if (kv_src_indices.size() > 0) { ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()}); for (int i = 0; i < kv_src_indices.size(); ++i) { src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]); } ovInfReq.set_tensor("src_idx", src_idx_tensor); - ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, 32, kv_dst_indices.size(), 96}); + + ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size}); for (int i = 0; i < kv_dst_indices.size(); ++i) { - for (int j = 0; j < 32; ++j) { - for (int k = 0; k < 96; ++k) { - dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * 96 + k] = int32_t(kv_dst_indices[i]); + for (int j = 0; j < kv_num_heads; ++j) { + for (int k = 0; k < kv_head_size; ++k) { + dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]); } } } ovInfReq.set_tensor("dst_idx", dst_idx_tensor); } else { FillTensor("src_idx", ov::element::i32, {0}, 0); - FillTensor("dst_idx", ov::element::i32, {1, 32, 0, 96}, 0); + FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0); } // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 7f2aa4a5cfa3f..31b6c364f7b89 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -75,11 +75,16 @@ std::string GetInputOutputName(std::shared_ptr<ov::Model> ov_model,
 void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                       std::vector<std::string>& not_kv_inputs,
                       const std::vector<std::string>& key_value_input_names,
-                      int gather_dim) {
+                      int gather_dim,
+                      const std::string& device) {
   if (ModelHasInputOutputNames(ov_model, "beam_idx")) {
     throw std::runtime_error("Model already has fused cache");
   }
 
+  // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding
+  // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
+  bool is_support_speculative_LLM = device.find("GPU") != std::string::npos;
+
   // Define input name candidates in priority order
   const std::vector<std::string> input_name_candidates = {
       "inputs_embeds",  // Default fallback
@@ -99,17 +104,22 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
   ov_model->add_parameters({beam_idx});
   not_kv_inputs.push_back(beam_idx->get_friendly_name());
 
-  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
-  src_idx->set_friendly_name("src_idx");
-  src_idx->output(0).get_tensor().add_names({"src_idx"});
-  ov_model->add_parameters({src_idx});
-  not_kv_inputs.push_back(src_idx->get_friendly_name());
-
-  auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
-  dst_idx->set_friendly_name("dst_idx");
-  dst_idx->output(0).get_tensor().add_names({"dst_idx"});
-  ov_model->add_parameters({dst_idx});
-  not_kv_inputs.push_back(dst_idx->get_friendly_name());
+  std::shared_ptr<ov::opset13::Parameter> src_idx;
+  std::shared_ptr<ov::opset13::Parameter> dst_idx;
+
+  if (is_support_speculative_LLM) {
+    src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
+    src_idx->set_friendly_name("src_idx");
+    src_idx->output(0).get_tensor().add_names({"src_idx"});
+    ov_model->add_parameters({src_idx});
+    not_kv_inputs.push_back(src_idx->get_friendly_name());
+
+    dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, update_shape);
+    dst_idx->set_friendly_name("dst_idx");
+    dst_idx->output(0).get_tensor().add_names({"dst_idx"});
+    ov_model->add_parameters({dst_idx});
+    not_kv_inputs.push_back(dst_idx->get_friendly_name());
+  }
 
   // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
   for (const auto& input_name : key_value_input_names) {
@@ -121,17 +131,25 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                                               beam_idx,
                                               ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));
 
-    auto updatekv_gather_op =
-        std::make_shared<ov::opset13::Gather>(gather_op,
-                                              src_idx,
-                                              ov::opset13::Constant::create(ov::element::i64, {}, {2}));
-
-    auto updatekv_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
-                                                                            dst_idx, updatekv_gather_op, ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+    std::shared_ptr<ov::Node> output_node;
+    if (is_support_speculative_LLM) {
+      auto updatekv_gather_op =
+          std::make_shared<ov::opset13::Gather>(gather_op,
+                                                src_idx,
+                                                ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+
+      auto updatekv_op = std::make_shared<ov::opset12::ScatterElementsUpdate>(gather_op,
+                                                                              dst_idx,
+                                                                              updatekv_gather_op,
+                                                                              ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+      output_node = updatekv_op;
+    } else {
+      output_node = gather_op;
+    }
 
     // Replace the source output for all consumers of the input tensor
     for (auto& consumer : consumers) {
-      consumer.replace_source_output(updatekv_op->output(0));
+      consumer.replace_source_output(output_node->output(0));
     }
   }
 
@@ -268,7 +286,7 @@ std::pair<std::vector<std::string>, std::vector<std::string>> ExtractInputKVTens
 }
 
 // Updated PatchStatefulDecoder function
-void PatchStatefulDecoder(std::shared_ptr<ov::Model> model) {
+void PatchStatefulDecoder(std::shared_ptr<ov::Model> model, const std::string& device) {
   // Use the dynamic pattern-based extraction logic
   auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model);
   auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns);
@@ -290,7 +308,7 @@ void PatchStatefulDecoder(std::shared_ptr<ov::Model> model) {
   // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0
   auto batch_dim = 0;
 
-  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim);
+  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, device);
 
   MakeStateful(model, key_value_input_names, key_value_output_names);
 }
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index 11b0cc1dbe9bb..ce7db01063426 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -26,13 +26,14 @@ bool ModelHasInputOutputNames(std::shared_ptr<ov::Model> model, const std::strin
 void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                       std::vector<std::string>& not_kv_inputs,
                       const std::vector<std::string>& key_value_input_names,
-                      int gather_dim);
+                      int gather_dim,
+                      const std::string& device = "");
 
 void MakeStateful(std::shared_ptr<ov::Model>& ov_model,
                   const std::vector<std::string>& key_value_input_names,
                   const std::vector<std::string>& key_value_output_names);
 
-void PatchStatefulDecoder(std::shared_ptr<ov::Model> model);
+void PatchStatefulDecoder(std::shared_ptr<ov::Model> model, const std::string& device = "");
 
 bool HasOpWithType(const std::shared_ptr<const ov::Model>& function, const std::string& type_name);
 

From 7ea12e74aa860d62cf7d11a78f00c72c3f86e210 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Thu, 11 Dec 2025 22:08:58 -0800
Subject: [PATCH 10/12] refactor code

---
 .../openvino/openvino_execution_provider.cc | 75 +++++++++----------
 .../core/providers/openvino/ov_interface.cc |  8 +-
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index 00a04aac11ab1..cc22fddab10f6 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -291,57 +291,52 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const char* const>
 
-      std::vector<size_t> src_indices;
-      std::vector<size_t> dst_indices;
+      auto parse_indices = [](const std::string& input, const std::string& index_type) -> std::pair<Status, std::vector<size_t>> {
+        std::vector<size_t> indices;
+        std::stringstream stream(input);
+        std::string token;
 
-      try {
-        // Parse source indices from comma-separated string
-        std::stringstream src_stream(src_string);
-        std::string src_token;
-        while (std::getline(src_stream, src_token, ',')) {
-          // Trim whitespace
-          src_token.erase(0, src_token.find_first_not_of(" \t"));
-          src_token.erase(src_token.find_last_not_of(" \t") + 1);
-
-          if (!src_token.empty()) {
-            int64_t index = std::stoll(src_token);
-            if (index >= 0) {
-              src_indices.push_back(static_cast<size_t>(index));
-            } else {
-              LOGS_DEFAULT(WARNING) << "kvcache_reorder src_index is < 0: " << index;
+        try {
+          while (std::getline(stream, token, ',')) {
+            // Trim whitespace
+            token.erase(0, token.find_first_not_of(" \t"));
+            token.erase(token.find_last_not_of(" \t") + 1);
+
+            if (!token.empty()) {
+              int64_t index = std::stoll(token);
+              if (index >= 0) {
+                indices.push_back(static_cast<size_t>(index));
+              } else {
+                return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                               "kvcache_reorder " + index_type + " cannot be negative: " + std::to_string(index)),
+                        std::vector<size_t>()};
+              }
             }
           }
+        } catch (const std::exception& e) {
+          return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+                         "Failed to parse kvcache_reorder " + index_type + ": " + std::string(e.what())),
+                  std::vector<size_t>()};
         }
 
-        // Parse destination indices from comma-separated string
-        std::stringstream dst_stream(dst_string);
-        std::string dst_token;
-        while (std::getline(dst_stream, dst_token, ',')) {
-          // Trim whitespace
-          dst_token.erase(0, dst_token.find_first_not_of(" \t"));
-          dst_token.erase(dst_token.find_last_not_of(" \t") + 1);
-
-          if (!dst_token.empty()) {
-            int64_t index = std::stoll(dst_token);
-            if (index >= 0) {
-              dst_indices.push_back(static_cast<size_t>(index));
-            } else {
-              LOGS_DEFAULT(WARNING) << "kvcache_reorder dst_index is < 0: " << index;
-            }
-          }
-        }
+        return {Status::OK(), std::move(indices)};
+      };
 
-      } catch (const std::exception& e) {
-        LOGS_DEFAULT(WARNING) << "Conversion for kvcache_reorder string value to int64_t indices failed. "
-                              << "Exception: " << e.what();
-        return Status::OK();
+      auto [src_status, src_indices] = parse_indices(src_string, "src_index");
+      if (!src_status.IsOK()) {
+        return src_status;
+      }
+
+      auto [dst_status, dst_indices] = parse_indices(dst_string, "dst_index");
+      if (!dst_status.IsOK()) {
+        return dst_status;
       }
 
       // Trigger KVCache Reorder for target Backend with vector arguments
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index b9c9fd0738cdb..db8603f0fab47 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -473,15 +473,15 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   uint64_t kv_head_size = dst_idx_shape[3];
   if (kv_src_indices.size() > 0) {
     ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-    for (int i = 0; i < kv_src_indices.size(); ++i) {
+    for (auto i = 0; i < kv_src_indices.size(); ++i) {
       src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
     }
     ovInfReq.set_tensor("src_idx", src_idx_tensor);
 
     ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-    for (int i = 0; i < kv_dst_indices.size(); ++i) {
-      for (int j = 0; j < kv_num_heads; ++j) {
-        for (int k = 0; k < kv_head_size; ++k) {
+    for (auto i = 0; i < kv_dst_indices.size(); ++i) {
+      for (auto j = 0; j < kv_num_heads; ++j) {
+        for (auto k = 0; k < kv_head_size; ++k) {
           dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
         }
       }

From ab95e39f9b84dfc2a1643ff12063717bb76677f1 Mon Sep 17 00:00:00 2001
From: Kotomi-Du
Date: Thu, 11 Dec 2025 22:33:46 -0800
Subject: [PATCH 11/12] minor change

---
 .../core/providers/openvino/ov_interface.cc | 48 +++++++++++--------
 .../core/providers/openvino/ov_interface.h  |  2 +
 .../openvino/ov_stateful_patch_utils.cc     |  8 ++--
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index db8603f0fab47..be008e6d5617e 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -380,6 +380,7 @@ void OVInferRequest::Infer() {
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
+  is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
 
   _npu_logits_slice_required = IsNPULogitsSliceRequired();
 
@@ -468,28 +469,31 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
   FillTensor("beam_idx", ov::element::i32, {1}, 0);
-  ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
-  uint64_t kv_num_heads = dst_idx_shape[1];
-  uint64_t kv_head_size = dst_idx_shape[3];
-  if (kv_src_indices.size() > 0) {
-    ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-    for (auto i = 0; i < kv_src_indices.size(); ++i) {
-      src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
-    }
-    ovInfReq.set_tensor("src_idx", src_idx_tensor);
 
-    ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-    for (auto i = 0; i < kv_dst_indices.size(); ++i) {
-      for (auto j = 0; j < kv_num_heads; ++j) {
-        for (auto k = 0; k < kv_head_size; ++k) {
-          dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
+  if (is_support_kvcache_reorder){
+    ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
+    uint64_t kv_num_heads = dst_idx_shape[1];
+    uint64_t kv_head_size = dst_idx_shape[3];
+    if (kv_src_indices.size() > 0) {
+      ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
+      for (auto i = 0; i < kv_src_indices.size(); ++i) {
+        src_idx_tensor.data<int32_t>()[i] = int32_t(kv_src_indices[i]);
+      }
+      ovInfReq.set_tensor("src_idx", src_idx_tensor);
+
+      ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
+      for (auto i = 0; i < kv_dst_indices.size(); ++i) {
+        for (auto j = 0; j < kv_num_heads; ++j) {
+          for (auto k = 0; k < kv_head_size; ++k) {
+            dst_idx_tensor.data<int32_t>()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
+          }
+        }
       }
+      ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
+    } else {
+      FillTensor("src_idx", ov::element::i32, {0}, 0);
+      FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0);
     }
-  }
-  ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
-  } else {
-    FillTensor("src_idx", ov::element::i32, {0}, 0);
-    FillTensor("dst_idx", ov::element::i32, {1, kv_num_heads, 0, kv_head_size}, 0);
   }
 
   // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids.
@@ -532,8 +536,10 @@ void StatefulOVInferRequest::Infer() {
 }
 
 void StatefulOVInferRequest::PostProcessInferRequest() {
-  kv_src_indices.clear();
-  kv_dst_indices.clear();
+  if(is_support_kvcache_reorder){
+    kv_src_indices.clear();
+    kv_dst_indices.clear();
+  }
 }
 
 void StatefulOVInferRequest::ReorderKVCache(const std::vector<size_t>& src_indices, const std::vector<size_t>& dst_indices) {
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index a352456f4ac41..f2de48cfe35fd 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -163,6 +163,8 @@ class StatefulOVInferRequest : public OVInferRequest {
   bool prefill_use_full_chat_history = false;
   std::vector<int64_t> cached_input_ids;
   std::vector<int64_t> cached_position_ids;
+
+  bool is_support_kvcache_reorder = false;
   std::vector<size_t> kv_src_indices;
   std::vector<size_t> kv_dst_indices;
 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 31b6c364f7b89..0b4f828c7cbb6 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -81,9 +81,9 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
     throw std::runtime_error("Model already has fused cache");
   }
 
-  // Flag to add Gather+ScatterElementsUpdate subgraph for LLM speculative decoding
+  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
   // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
-  bool is_support_speculative_LLM = device.find("GPU") != std::string::npos;
+  bool is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
 
   // Define input name candidates in priority order
   const std::vector<std::string> input_name_candidates = {
@@ -107,7 +107,7 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
   std::shared_ptr<ov::opset13::Parameter> src_idx;
   std::shared_ptr<ov::opset13::Parameter> dst_idx;
 
-  if (is_support_speculative_LLM) {
+  if (is_support_kvcache_reorder) {
     src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32, ov::PartialShape({update_shape[2]}));
     src_idx->set_friendly_name("src_idx");
     src_idx->output(0).get_tensor().add_names({"src_idx"});
@@ -132,7 +132,7 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
 
     std::shared_ptr<ov::Node> output_node;
-    if (is_support_speculative_LLM) {
+    if (is_support_kvcache_reorder) {
       auto updatekv_gather_op =
           std::make_shared<ov::opset13::Gather>(gather_op,
                                                 src_idx,

From 166586572c93014da65581b09dda2de6bf9efe97 Mon Sep 17 00:00:00 2001
From: czekun
Date: Tue, 16 Dec 2025 21:48:53 -0800
Subject: [PATCH 12/12] refactor with int32 indices, string_view parsing. move
 fuse flag in exenetwork.
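
For illustration, here is a standalone sketch of the strtol-over-string_view parsing style this refactor moves to (not part of the diff below; the helper name and the "1, 0, 2" sample value are hypothetical, and the input is assumed to come from a NUL-terminated buffer, as std::string guarantees):

    #include <cerrno>
    #include <cstdlib>
    #include <iostream>
    #include <string_view>
    #include <vector>

    // Parse a comma-separated list of non-negative integers into `out`.
    // Returns false on a non-numeric token, a negative value, or overflow.
    static bool ParseIndices(std::string_view input, std::vector<int32_t>& out) {
      while (!input.empty()) {
        const auto comma = input.find(',');
        const auto part = input.substr(0, comma);
        errno = 0;
        char* end = nullptr;
        // strtol skips leading whitespace and stops at the first non-digit,
        // so the trailing comma (or the buffer's NUL) terminates the scan.
        const long value = std::strtol(part.data(), &end, 10);
        if (end == part.data() || value < 0 || errno == ERANGE) {
          return false;
        }
        out.push_back(static_cast<int32_t>(value));
        if (comma == std::string_view::npos) break;
        input.remove_prefix(comma + 1);
      }
      return true;
    }

    int main() {
      std::vector<int32_t> indices;
      if (ParseIndices("1, 0, 2", indices)) {
        for (int32_t v : indices) std::cout << v << '\n';  // prints 1, 0, 2
      }
    }

Compared with the previous per-token std::string + std::stoll approach, this avoids token copies and exception-based error handling.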
--- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_manager.h | 2 +- .../openvino/backends/basic_backend.cc | 2 +- .../openvino/backends/basic_backend.h | 2 +- .../core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 76 ++++++++++--------- .../core/providers/openvino/ov_interface.cc | 47 ++++++------ .../core/providers/openvino/ov_interface.h | 24 +++--- .../openvino/ov_stateful_patch_utils.cc | 20 ++--- .../openvino/ov_stateful_patch_utils.h | 4 +- 10 files changed, 90 insertions(+), 91 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d6d098d66242c..5e80ee3738ed8 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -781,7 +781,7 @@ void BackendManager::RewindKVCache(size_t index) { } } -void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BackendManager::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { if (concrete_backend_) { concrete_backend_->ReorderKVCache(src_indices, dst_indices); } diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 62cc7d95a4ef9..f8a74b9cbcfa4 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -31,7 +31,7 @@ class BackendManager { void TryExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, bool include_embed_data); ov::CompiledModel GetOVCompiledModel(); void RewindKVCache(size_t index); - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 9f85d42821230..7f4d1f74cfb7b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -315,7 +315,7 @@ void BasicBackend::RewindKVCache(size_t index) { }); } -void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { +void BasicBackend::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) { infer_req_pool_->forEachIdleRequest([&](OVInferRequestPtr& infer_request) { infer_request->ReorderKVCache(src_indices, dst_indices); }); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index d2b57fdfbac84..c7505d59eec0c 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -137,7 +137,7 @@ class BasicBackend : public IBackend { return exe_network_.Get(); } void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 672fdbc218a78..4444f37ac7433 100644 --- 
a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -18,7 +18,7 @@ class IBackend { virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} - virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} + virtual void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index cc22fddab10f6..b7b0894d7bff7 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -295,53 +296,54 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span std::pair> { - std::vector indices; - std::stringstream stream(input); - std::string token; - - try { - while (std::getline(stream, token, ',')) { - // Trim whitespace - token.erase(0, token.find_first_not_of(" \t")); - token.erase(token.find_last_not_of(" \t") + 1); - - if (!token.empty()) { - int64_t index = std::stoll(token); - if (index >= 0) { - indices.push_back(static_cast(index)); - } else { - return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "kvcache_reorder " + index_type + " cannot be negative: " + std::to_string(index)), - std::vector()}; - } - } + std::string_view src_string(value.begin(), value.begin() + delimiter_pos); + std::string_view dst_string(value.begin() + delimiter_pos + 1, value.end()); + + constexpr auto parse_indices = [](std::string_view input, const std::string& index_type) -> std::variant> { + std::vector indices; + while (!input.empty()) { + const auto delimiter_pos = input.find(','); + const auto part = input.substr(0, delimiter_pos); + errno = 0; + char* parse_end = nullptr; + // strtoll/stoll already skips whitespaces + const auto index = std::strtol(part.data(), &parse_end, 10); + if (parse_end == part.data()) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Failed to parse kvcache_reorder " + index_type + ": " + std::string(part)); + } + if (index < 0) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "kvcache_reorder " + index_type + " cannot be negative: " + std::string(part)); + } + if (errno == ERANGE) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "kvcache_reorder " + index_type + " exceed INT32_MAX: " + std::string(part)); + } + indices.push_back(static_cast(index)); + if (delimiter_pos != std::string_view::npos) { + // ignore any trailing chars after the number, can do futher checking if needed + input.remove_prefix(part.size() + 1); + } else { + break; } - } catch (const std::exception& e) { - return {Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Failed to parse kvcache_reorder " + index_type + ": " + std::string(e.what())), - std::vector()}; } - - return {Status::OK(), std::move(indices)}; + return indices; }; - auto [src_status, src_indices] = parse_indices(src_string, "src_index"); - if (!src_status.IsOK()) { - return src_status; + const auto src_indices = parse_indices(src_string, "src_index"); + if 
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index be008e6d5617e..5255729478dbe 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -109,9 +109,13 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model,
   bool model_status = IsStateful(model);
   LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False");

+  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
+  bool is_fused_kvcache_reorder = false;
   if (!model_status) {
     LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl;
-    PatchStatefulDecoder(model, hw_target);
+    // TODO: extend to the NPU device once OpenVINO NPU has the related optimization
+    is_fused_kvcache_reorder = hw_target.find("GPU") != std::string::npos;
+    PatchStatefulDecoder(model, is_fused_kvcache_reorder);
   }

   if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) {
@@ -152,7 +156,7 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model,
   LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow";
   compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config);
-  OVExeNetwork exe(compiled_model, hw_target, true);
+  OVExeNetwork exe(compiled_model, hw_target, true, is_fused_kvcache_reorder);

   return exe;
 }
@@ -332,7 +336,7 @@ std::shared_ptr OVExeNetwork::CreateInferRequest() {
   auto infReq = compiled_model_obj.create_infer_request();
   std::shared_ptr ovInfReq;
   if (is_stateful_causallm) {
-    ovInfReq = std::make_shared(std::move(infReq), target_device);
+    ovInfReq = std::make_shared(std::move(infReq), target_device, is_fused_kvcache_reorder);
   } else {
     ovInfReq = std::make_shared(std::move(infReq));
   }
@@ -377,10 +381,9 @@ void OVInferRequest::Infer() {
                 "In Error Couldn't start Inference");
 }

-StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
-    : OVInferRequest(std::move(infer_request)), target_device(device) {
+StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device, bool fused_kvcache_reorder)
+    : OVInferRequest(std::move(infer_request)), target_device(device), is_fused_kvcache_reorder(fused_kvcache_reorder) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-  is_support_kvcache_reorder = device.find("GPU") != std::string::npos;

   _npu_logits_slice_required = IsNPULogitsSliceRequired();
@@ -470,23 +473,23 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
   // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
   FillTensor("beam_idx", ov::element::i32, {1}, 0);

-  if (is_support_kvcache_reorder){
+  if (is_fused_kvcache_reorder) {
     ov::Shape dst_idx_shape = ovInfReq.get_tensor("dst_idx").get_shape();
-    uint64_t kv_num_heads = dst_idx_shape[1];
-    uint64_t kv_head_size = dst_idx_shape[3];
+    const auto kv_num_heads = dst_idx_shape[1];
+    const auto kv_head_size = dst_idx_shape[3];

     if (kv_src_indices.size() > 0) {
       ov::Tensor src_idx_tensor = ov::Tensor(ov::element::i32, {kv_src_indices.size()});
-      for (auto i = 0; i < kv_src_indices.size(); ++i) {
-        src_idx_tensor.data()[i] = int32_t(kv_src_indices[i]);
+      const auto src_idx_ptr = src_idx_tensor.data();
+      for (size_t i = 0; i < kv_src_indices.size(); ++i) {
+        src_idx_ptr[i] = static_cast(kv_src_indices[i]);
       }
       ovInfReq.set_tensor("src_idx", src_idx_tensor);

       ov::Tensor dst_idx_tensor = ov::Tensor(ov::element::i32, {1, kv_num_heads, kv_dst_indices.size(), kv_head_size});
-      for (auto i = 0; i < kv_dst_indices.size(); ++i) {
-        for (auto j = 0; j < kv_num_heads; ++j) {
-          for (auto k = 0; k < kv_head_size; ++k) {
-            dst_idx_tensor.data()[(j * kv_dst_indices.size() + i) * kv_head_size + k] = int32_t(kv_dst_indices[i]);
-          }
+      const auto dst_idx_ptr = dst_idx_tensor.data();
+      for (size_t i = 0; i < kv_num_heads; ++i) {
+        for (size_t j = 0; j < kv_dst_indices.size(); ++j) {
+          std::fill_n(dst_idx_ptr + (i * kv_dst_indices.size() + j) * kv_head_size, kv_head_size, kv_dst_indices[j]);
         }
       }
       ovInfReq.set_tensor("dst_idx", dst_idx_tensor);
@@ -536,13 +539,13 @@ void StatefulOVInferRequest::Infer() {
 }

 void StatefulOVInferRequest::PostProcessInferRequest() {
-  if(is_support_kvcache_reorder){
+  if (is_fused_kvcache_reorder) {
     kv_src_indices.clear();
     kv_dst_indices.clear();
   }
 }

-void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {
+void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) {
   // Validate input parameters
   if (src_indices.size() != dst_indices.size()) {
     ORT_THROW(log_tag + "ReorderKVCache: src_indices and dst_indices must have the same size. "
@@ -553,12 +556,8 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic
   LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with "
                      << src_indices.size() << " index pairs";

-  kv_src_indices.clear();
-  kv_dst_indices.clear();
-  for (int i = 0; i < src_indices.size(); ++i) {
-    kv_src_indices.emplace_back(src_indices[i]);
-    kv_dst_indices.emplace_back(dst_indices[i]);
-  }
+  kv_src_indices = src_indices;
+  kv_dst_indices = dst_indices;
 }

 void StatefulOVInferRequest::RewindKVCache(size_t index) {
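The dst_idx tensor must match the update tensor's shape, {1, num_heads, n_indices, head_size}, with each destination index broadcast across its whole head slice; that layout is what the std::fill_n rewrite computes while dropping the innermost loop. A standalone sketch of the same layout math follows, using plain buffers instead of ov::Tensor; the sizes are made up for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Illustrative sizes only; in the patch these come from the dst_idx input shape.
  const size_t num_heads = 2, head_size = 4;
  const std::vector<int32_t> dst_indices = {5, 7, 9};  // n_indices = 3

  // Flat buffer laid out as [1, num_heads, n_indices, head_size], row-major.
  std::vector<int32_t> dst_idx(num_heads * dst_indices.size() * head_size);

  for (size_t i = 0; i < num_heads; ++i) {
    for (size_t j = 0; j < dst_indices.size(); ++j) {
      // Element [0][i][j][k] == dst_indices[j] for every k in the head slice.
      std::fill_n(dst_idx.data() + (i * dst_indices.size() + j) * head_size,
                  head_size, dst_indices[j]);
    }
  }

  // First head prints: 5 5 5 5 7 7 7 7 9 9 9 9
  for (size_t n = 0; n < dst_indices.size() * head_size; ++n) std::cout << dst_idx[n] << ' ';
  std::cout << '\n';
}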
" @@ -553,12 +556,8 @@ void StatefulOVInferRequest::ReorderKVCache(const std::vector& src_indic LOGS_DEFAULT(INFO) << log_tag << "ReorderKVCache: Reordering OpenVINO-internal KVCache state with " << src_indices.size() << " index pairs"; - kv_src_indices.clear(); - kv_dst_indices.clear(); - for (int i = 0; i < src_indices.size(); ++i) { - kv_src_indices.emplace_back(src_indices[i]); - kv_dst_indices.emplace_back(dst_indices[i]); - } + kv_src_indices = src_indices; + kv_dst_indices = dst_indices; } void StatefulOVInferRequest::RewindKVCache(size_t index) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index f2de48cfe35fd..2b61a7d603be6 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -91,10 +91,11 @@ class OVExeNetwork { ov::CompiledModel compiled_model_obj; std::string target_device; bool is_stateful_causallm; + bool is_fused_kvcache_reorder = false; public: - explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false) - : compiled_model_obj(std::move(compiled_model)), target_device(std::move(device)), is_stateful_causallm(stateful_causallm) {} + explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false, bool fused_kvcache_reorder = false) + : compiled_model_obj(std::move(compiled_model)), target_device(std::move(device)), is_stateful_causallm(stateful_causallm), is_fused_kvcache_reorder(fused_kvcache_reorder) {} OVExeNetwork() : compiled_model_obj(ov::CompiledModel()), is_stateful_causallm(false) {} ov::CompiledModel& Get() { return compiled_model_obj; } std::shared_ptr CreateInferRequest(); @@ -136,16 +137,16 @@ class OVInferRequest { return ovInfReq; } virtual void RewindKVCache([[maybe_unused]] size_t index) {} - virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} + virtual void ReorderKVCache([[maybe_unused]] const std::vector& src_indices, [[maybe_unused]] const std::vector& dst_indices) {} }; class StatefulOVInferRequest : public OVInferRequest { public: - explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device); + explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device, bool fused_kvcache_reorder = false); void Infer() override; void RewindKVCache(size_t index) override; - void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; + void ReorderKVCache(const std::vector& src_indices, const std::vector& dst_indices) override; void FillTensor(const std::string& tensor_name, const ov::element::Type& type, const std::vector& shape, int32_t fill_value); void CacheTensor(const std::string& tensor_name, std::vector& cache); @@ -158,15 +159,16 @@ class StatefulOVInferRequest : public OVInferRequest { void PostProcessInferRequest(); std::string target_device; - // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, - // and ensure that full chat history is passed for each prefill call. 
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
index 0b4f828c7cbb6..770c371c399b8 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -76,15 +76,11 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                       std::vector& not_kv_inputs,
                       const std::vector& key_value_input_names,
                       int gather_dim,
-                      const std::string& device) {
+                      const bool is_fused_kvcache_reorder) {
   if (ModelHasInputOutputNames(ov_model, "beam_idx")) {
     throw std::runtime_error("Model already has fused cache");
   }

-  // Flag to add Gather+ScatterElementsUpdate subgraph to reorder KV cache for LLM speculative decoding
-  // TO-DO: extend to NPU device when OpenVINO NPU has related optimization
-  bool is_support_kvcache_reorder = device.find("GPU") != std::string::npos;
-
   // Define input name candidates in priority order
   const std::vector input_name_candidates = {
       "inputs_embeds",  // Default fallback
@@ -107,7 +103,7 @@ void FuseCacheReorder(std::shared_ptr ov_model,
   std::shared_ptr src_idx;
   std::shared_ptr dst_idx;
-  if (is_support_kvcache_reorder) {
+  if (is_fused_kvcache_reorder) {
     src_idx = std::make_shared(ov::element::i32, ov::PartialShape({update_shape[2]}));
     src_idx->set_friendly_name("src_idx");
     src_idx->output(0).get_tensor().add_names({"src_idx"});
@@ -132,16 +128,16 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                                                 ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim}));

   std::shared_ptr output_node;
-  if (is_support_kvcache_reorder) {
+  if (is_fused_kvcache_reorder) {
     auto updatekv_gather_op = std::make_shared(gather_op,
                                                src_idx,
                                                ov::opset13::Constant::create(ov::element::i64, {}, {2}));
     auto updatekv_op = std::make_shared(gather_op,
-                                        dst_idx,
-                                        updatekv_gather_op,
-                                        ov::opset13::Constant::create(ov::element::i64, {}, {2}));
+                                        dst_idx,
+                                        updatekv_gather_op,
+                                        ov::opset13::Constant::create(ov::element::i64, {}, {2}));
     output_node = updatekv_op;
   } else {
     output_node = gather_op;
@@ -286,7 +282,7 @@ std::pair, std::vector> ExtractInputKVTens
 }

 // Updated PatchStatefulDecoder function
-void PatchStatefulDecoder(std::shared_ptr model, const std::string& device) {
+void PatchStatefulDecoder(std::shared_ptr model, const bool is_fused_kvcache_reorder) {
   // Use the dynamic pattern-based extraction logic
   auto [key_value_output_names, extracted_patterns] = ExtractKVPatternsFromOutputs(model);
   auto [key_value_input_names, not_kv_inputs] = ExtractInputKVTensors(model, extracted_patterns);
@@ -308,7 +304,7 @@ void PatchStatefulDecoder(std::shared_ptr model, const std::string& d
   // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0
   auto batch_dim = 0;

-  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, device);
+  FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim, is_fused_kvcache_reorder);

   MakeStateful(model, key_value_input_names, key_value_output_names);
 }
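Per KV input, the fused subgraph gathers rows along the sequence axis (axis 2) at src_idx and scatters them back into the same tensor at dst_idx, so the reorder happens inside the compiled graph rather than through state read-backs. A minimal standalone sketch of the same construction follows, using a toy 4-D parameter in place of the model's KV cache read and the same opset13 ops as the patch.

#include <memory>
#include <openvino/openvino.hpp>
#include <openvino/opsets/opset13.hpp>

// Builds data -> Gather(axis=2, src_idx) -> ScatterElementsUpdate(axis=2, dst_idx)
// over a toy [1, heads, seq, head_size] input, mirroring the fused reorder subgraph.
std::shared_ptr<ov::Model> BuildReorderDemo() {
  auto data = std::make_shared<ov::opset13::Parameter>(ov::element::f32,
                                                       ov::PartialShape{1, 2, 8, 4});
  auto src_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32,
                                                          ov::PartialShape{3});
  // ScatterElementsUpdate requires indices shaped like the updates tensor, which is
  // why PreProcessInferRequest broadcasts dst_idx to [1, heads, n_indices, head_size].
  auto dst_idx = std::make_shared<ov::opset13::Parameter>(ov::element::i32,
                                                          ov::PartialShape{1, 2, 3, 4});

  auto axis2 = ov::opset13::Constant::create(ov::element::i64, {}, {2});
  auto gathered = std::make_shared<ov::opset13::Gather>(data, src_idx, axis2);
  auto scattered = std::make_shared<ov::opset13::ScatterElementsUpdate>(data, dst_idx, gathered, axis2);

  return std::make_shared<ov::Model>(ov::OutputVector{scattered},
                                     ov::ParameterVector{data, src_idx, dst_idx});
}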
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
index ce7db01063426..bfb6224fc8993 100644
--- a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
+++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h
@@ -27,13 +27,13 @@ void FuseCacheReorder(std::shared_ptr ov_model,
                       std::vector& not_kv_inputs,
                       const std::vector& key_value_input_names,
                       int gather_dim,
-                      const std::string& device = "");
+                      const bool is_fused_kvcache_reorder = false);

 void MakeStateful(std::shared_ptr& ov_model,
                   const std::vector& key_value_input_names,
                   const std::vector& key_value_output_names);

-void PatchStatefulDecoder(std::shared_ptr model, const std::string& device = "");
+void PatchStatefulDecoder(std::shared_ptr model, const bool is_fused_kvcache_reorder = false);

 bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name);
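A hedged end-to-end sketch of how these helpers would be driven, simplified from StatefulCompileModel: it assumes a stateless decoder model on disk (the "decoder.xml" path is hypothetical) and reuses the GPU-only gating from the patch; the exact PatchStatefulDecoder parameter types are elided in this diff.

#include <openvino/openvino.hpp>
// Declaration from ov_stateful_patch_utils.h, with the elided model type assumed
// to be std::shared_ptr<ov::Model>:
// void PatchStatefulDecoder(std::shared_ptr<ov::Model> model,
//                           const bool is_fused_kvcache_reorder = false);

int main() {
  ov::Core core;
  auto model = core.read_model("decoder.xml");  // hypothetical stateless decoder model

  const std::string hw_target = "GPU";
  // GPU-only for now; the patch leaves NPU for a future OpenVINO optimization.
  const bool fuse_reorder = hw_target.find("GPU") != std::string::npos;

  // Adds beam_idx (plus src_idx/dst_idx when fused) inputs and makes KV I/O stateful.
  PatchStatefulDecoder(model, fuse_reorder);
  auto compiled = core.compile_model(model, hw_target);
  auto request = compiled.create_infer_request();
  (void)request;  // src_idx/dst_idx are then populated per step in PreProcessInferRequest
}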