From 90a14302b60c0600c500abf169a9d0121d14a3a0 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 18:15:15 -0800 Subject: [PATCH 1/8] Add m_is_full_model in decoder class to identify the graph is subgraph or not for fallback using --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 +++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 6 ++++++ ggml/src/ggml-openvino/openvino/decoder.hpp | 2 ++ ggml/src/ggml-openvino/openvino/translate_session.cpp | 2 +- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 72f6144708a..2b007e1562d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -66,6 +66,8 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } + m_is_full_model = has_inp_tokens && has_output; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node); @@ -150,6 +152,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); + if (src_name == "inp_tokens") { + has_inp_tokens = true; + } + // Add model inputs if (!naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; @@ -176,6 +182,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (!naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; + if (node_output_name.find("output") != std::string::npos) { + has_output = true; + } // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 111eb7200b8..361a9c9434f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -202,8 +202,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } + virtual bool is_full_model() const override {return m_is_full_model; } + bool m_is_static = false; bool m_is_prefill = false; + bool m_is_full_model = true; int m_prefill_chunk_size = 0; static std::vector get_shape(const ggml_tensor * tensor); @@ -229,6 +232,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map m_model_outputs; std::vector m_node_info_list; + bool has_inp_tokens = false; + bool has_output = false; + ModelParams m_model_params; ComputeParams m_compute_params; }; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 1603c7fd201..71d3c26e9c9 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase { virtual int get_op_case(int node_idx) const = 0; + virtual bool is_full_model() const = 0; + virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; diff 
--git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 546778a4707..45fe19d4918 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -190,7 +190,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } }; - if (!m_naive) { + if (!m_naive && ggml_model_decoder->is_full_model()) { preprocess(*tensor_map, *ggml_model_decoder); } ggml_model_decoder->visit_subgraph(node_visitor); From 8422032c280bd7744078785bf1c7f47116bcf13a Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 21:21:53 -0800 Subject: [PATCH 2/8] Fallback to CPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 156 ++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/utils.cpp | 3 +- 4 files changed, 164 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2b007e1562d..5c2465fa39c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -59,7 +59,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, } validate_cgraph(); - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -67,6 +66,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, } m_is_full_model = has_inp_tokens && has_output; + if (!m_is_full_model) { + compute_cgraph_dynamic_dims(); + add_extra_model_inputs_for_fallback(); + add_extra_model_outputs_for_fallback(); + } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); @@ -368,7 +372,7 @@ void GgmlOvDecoder::validate_cgraph() const { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const { auto name = std::string(input->name); ov::PartialShape input_shape; @@ -400,6 +404,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else { input_shape = ov::PartialShape{get_shape(input)}; } + if (dynamic_dim_index != -1) { + input_shape[3-dynamic_dim_index] = -1; + } return input_shape; } @@ -872,3 +879,148 @@ const std::string & GgmlOvDecoder::get_op_type() const { static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } + +void GgmlOvDecoder::compute_cgraph_dynamic_dims() { + // lambda递归查找所有输入节点 + auto visit_node = [&](auto && self, ggml_tensor * node) -> void { + if (!node) { + return; + } + + if (node->op == GGML_OP_CPY) { + m_node_dynamic_dims[node] = -1; + } + + if (m_node_dynamic_dims.count(node)) { + return; + } + // 这里可以根据实际需求设置dynamic dim,这里用ne[0]举例 + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * src = node->src[i]; + if (src) { + self(self, src); + } + } + switch (node->op) { + case GGML_OP_NONE: + m_node_dynamic_dims[node] = -1; + if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" || + std::string(node->name) == "inp_out_ids") { + m_node_dynamic_dims[node] = 0; + } + break; + case GGML_OP_GET_ROWS: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = 1; + } + break; + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: 
+ m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + } + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + } + break; + case GGML_OP_VIEW: + case GGML_OP_FLASH_ATTN_EXT: + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + int same_dim_count = 0; + for (int i = 0; i < 4; i++) { + if (node->ne[i] == dynamic_dim_value) { + m_node_dynamic_dims[node] = i; + same_dim_count++; + } + } + if (same_dim_count != 1) { + std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl; + } + } + break; + case GGML_OP_RMS_NORM: + case GGML_OP_ADD: + case GGML_OP_GLU: + case GGML_OP_ROPE: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: + m_node_dynamic_dims[node] = -1; + break; + default: + std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl; + break; + } + }; + + // 对所有节点递归处理 + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + visit_node(visit_node, node); + } +} + +void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { + std::map address_map; + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + address_map[node->data] = node; + } + + for (const auto & pair : address_map) { + const std::string & name = pair.second->name; + if (m_model_outputs.find(name) == m_model_outputs.end() && + name.find("view") == std::string::npos && + name.find("ffn") == std::string::npos) { + m_model_outputs[name] = pair.second; + } + } +} + +void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto * src = node->src[i]; + if (src == nullptr) { + continue; + } + if (src->view_src) { + continue; + } + std::string src_name = std::string(src->name); + if (m_model_weights.find(src_name) != m_model_weights.end()) { + continue; + } + // 在m_node_info_list的node_name中找src->name,如果找到了,说明src是中间节点,不是输入节点 + bool is_intermediate_node = false; + for (const auto & node_info : m_node_info_list) { + if (node_info.node_name == src_name) { + is_intermediate_node = true; + break; + } + } + if (is_intermediate_node) { + continue; + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + + m_inputs[src_name] = src; + auto param_node = std::make_shared( + get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src])); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 361a9c9434f..befced43608 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -175,7 +175,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_static() const override { return m_is_static; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, 
const ggml_tensor * input, int dynamic_dim_index=-1) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -208,6 +208,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_is_full_model = true; int m_prefill_chunk_size = 0; + bool m_xuejun = false; static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); @@ -218,6 +219,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { private: void set_input_output(ggml_tensor * node, bool naive = false); int compute_op_case(const ggml_tensor * node) const; + void compute_cgraph_dynamic_dims(); + void add_extra_model_outputs_for_fallback(); + void add_extra_model_inputs_for_fallback(); void validate_cgraph() const; @@ -231,6 +235,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_weights; std::map m_model_outputs; std::vector m_node_info_list; + std::map m_node_dynamic_dims; bool has_inp_tokens = false; bool has_output = false; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f70..da419fc15f4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -287,6 +287,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, + // GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -318,7 +319,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: { auto supported = supported_ops.find(op->op) != supported_ops.end(); if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1f94d4bad60..442521e01e7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -118,6 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + ggml_decoder->m_xuejun = true; decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -432,7 +433,7 @@ std::map get_types_to_requant(const std::string & dev } bool is_naive(ggml_cgraph * cgraph) { - constexpr int naive_graph_size_threshold = 20; + constexpr int naive_graph_size_threshold = 0; return cgraph->n_nodes < naive_graph_size_threshold; } From 9e489b9145ffe6b4b23d312f3533d5a5b6ec284e Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 22:50:19 -0800 Subject: [PATCH 3/8] clean code --- ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index da419fc15f4..e809d250f70 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ 
b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -287,7 +287,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - // GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -319,7 +318,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: { auto supported = supported_ops.find(op->op) != supported_ops.end(); if (!supported) { - // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } } From a959262cc8f1c856d5d5e18b126426865762f879 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Mon, 8 Dec 2025 17:03:16 -0800 Subject: [PATCH 4/8] clean code --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +---- ggml/src/ggml-openvino/ggml-decoder.h | 1 - ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 5c2465fa39c..f2cdbf2f004 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -881,7 +881,6 @@ const std::string & GgmlOvDecoder::get_op_type() const { } void GgmlOvDecoder::compute_cgraph_dynamic_dims() { - // lambda递归查找所有输入节点 auto visit_node = [&](auto && self, ggml_tensor * node) -> void { if (!node) { return; @@ -894,7 +893,6 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { if (m_node_dynamic_dims.count(node)) { return; } - // 这里可以根据实际需求设置dynamic dim,这里用ne[0]举例 for (int i = 0; i < GGML_MAX_SRC; i++) { ggml_tensor * src = node->src[i]; if (src) { @@ -961,7 +959,6 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { } }; - // 对所有节点递归处理 for (int i = 0; i < m_cgraph->n_nodes; i++) { ggml_tensor * node = m_cgraph->nodes[i]; visit_node(visit_node, node); @@ -1000,7 +997,7 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { if (m_model_weights.find(src_name) != m_model_weights.end()) { continue; } - // 在m_node_info_list的node_name中找src->name,如果找到了,说明src是中间节点,不是输入节点 + bool is_intermediate_node = false; for (const auto & node_info : m_node_info_list) { if (node_info.node_name == src_name) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index befced43608..e8e09b6d8ca 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -208,7 +208,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_is_full_model = true; int m_prefill_chunk_size = 0; - bool m_xuejun = false; static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f70..25c854811eb 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -286,7 +286,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, 
        GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-       GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
+       GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE,
        // softmax is not updated due to replaced by flash_attn_ext
        // GGML_OP_SOFT_MAX,
        GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 442521e01e7..13fea7f0519 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -118,7 +118,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
     ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static);
-    ggml_decoder->m_xuejun = true;
     decoder_end_time = ggml_time_us();

     auto input_model = std::make_shared(ggml_decoder);

From bcb6945b01ff03385748619436b2579b62436fd1 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Mon, 8 Dec 2025 17:36:59 -0800
Subject: [PATCH 5/8] recover code

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 25c854811eb..e809d250f70 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -286,7 +286,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
     static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD,
        GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-       GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE,
+       GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
        // softmax is not updated due to replaced by flash_attn_ext
        // GGML_OP_SOFT_MAX,
        GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};

From 1504e76645404c8ec69aad33fa0f8329b2838a49 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Mon, 15 Dec 2025 17:34:26 -0800
Subject: [PATCH 6/8] Fix errors & make the fallback handling more general

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 10 +++----
 ggml/src/ggml-openvino/utils.cpp        | 35 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index f2cdbf2f004..22f15c154f8 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -969,14 +969,15 @@ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
     std::map address_map;
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         ggml_tensor * node = m_cgraph->nodes[i];
+        if (node->op == GGML_OP_VIEW) {
+            continue;
+        }
         address_map[node->data] = node;
     }

     for (const auto & pair : address_map) {
         const std::string & name = pair.second->name;
-        if (m_model_outputs.find(name) == m_model_outputs.end() &&
-            name.find("view") == std::string::npos &&
-            name.find("ffn") == std::string::npos) {
+        if (m_model_outputs.find(name) == m_model_outputs.end()) {
             m_model_outputs[name] = pair.second;
         }
     }
@@ -990,9 +991,6 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
             if (src == nullptr) {
                 continue;
             }
-            if (src->view_src) {
-                continue;
-            }
             std::string src_name = std::string(src->name);
             if (m_model_weights.find(src_name) != m_model_weights.end()) {
                 continue;
             }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index
13fea7f0519..d1c27c8b081 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" +#include "ggml-cpu.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" @@ -486,6 +487,40 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, } else { input_shape = ggml_decoder->get_shape(ggml_tensor); } + + // If the tensor is a result of PERMUTE operation, use ggml_cont to make it contiguous + if (ggml_tensor->op == GGML_OP_PERMUTE) { + // Create a temporary context for ggml_cont operation + // Need space for: tensor overhead, tensor data, graph structure, and work buffer + size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024; + struct ggml_init_params params = { + /*.mem_size =*/mem_size, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/false, + }; + struct ggml_context * temp_ctx = ggml_init(params); + if (temp_ctx == NULL) { + throw std::runtime_error("Failed to initialize temporary context for PERMUTE"); + } + + // Create contiguous tensor using ggml_cont + struct ggml_tensor * cont_tensor = ggml_cont(temp_ctx, const_cast(ggml_tensor)); + + // Build a simple graph to compute ggml_cont + struct ggml_cgraph * gf = ggml_new_graph(temp_ctx); + ggml_build_forward_expand(gf, cont_tensor); + ggml_graph_compute_with_ctx(temp_ctx, gf, 1); + + // Create OpenVINO tensor with contiguous data + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + memcpy(input_tensor.data(), cont_tensor->data, ggml_nbytes(cont_tensor)); + + // Free temporary context + ggml_free(temp_ctx); + + return input_tensor; + } + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; } From 54ebe74d6d1dc73dea142e2ecf035e0e0e16e07e Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Tue, 23 Dec 2025 00:33:05 -0800 Subject: [PATCH 7/8] Add function description for fallback mechanisms --- ggml/src/ggml-openvino/ggml-decoder.cpp | 58 +++++++++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 8 +++- ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 22f15c154f8..5a78102a022 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -65,6 +65,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } + // m_is_full_model = has_inp_tokens && has_output; if (!m_is_full_model) { compute_cgraph_dynamic_dims(); @@ -880,6 +881,27 @@ const std::string & GgmlOvDecoder::get_op_type() const { return unknown_op; } +/** + * @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms. + * + * This function traverses the computation graph and determines the dynamic dimensions + * for each node based on its operation type and dependencies. The dynamic dimension + * is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no dynamic + * dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL, GGML_OP_VIEW, + * etc., are handled to compute the dynamic dimension index. + * + * Key behaviors: + * - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others + * are analyzed to determine their dynamic dimensions. 
+ * - Nodes with specific names (e.g., "inp_tokens", "inp_pos", "inp_out_ids") are + * explicitly assigned a dynamic dimension index of 0. + * - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that + * the dynamic dimension is uniquely determined; otherwise, a warning is printed. + * - Unhandled operations print a message indicating the node name and operation type. + * + * This function is critical for preparing the computation graph for execution, ensuring + * that dynamic dimensions are correctly propagated and resolved. + */ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { auto visit_node = [&](auto && self, ggml_tensor * node) -> void { if (!node) { @@ -965,6 +987,23 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { } } +/** + * @brief Adds extra model outputs to support fallback mechanisms. + * + * This function ensures that all relevant nodes in the computation graph are included + * as model outputs for fallback scenarios. It creates a mapping of tensor data addresses + * to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation. + * + * Key behaviors: + * - Iterates through all nodes in the computation graph and maps their data addresses + * to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW. + * - Adds nodes to the `m_model_outputs` map if they are not already present, using + * the tensor's name as the key. + * + * This function is essential for ensuring that fallback mechanisms have access to all + * necessary model outputs, particularly in scenarios where certain outputs are not + * explicitly defined in the original model configuration. + */ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { std::map address_map; for (int i = 0; i < m_cgraph->n_nodes; i++) { @@ -983,6 +1022,25 @@ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { } } +/** +* @brief Adds extra model inputs to support fallback mechanisms. +* +* This function ensures that all necessary input nodes in the computation graph are +* included as model inputs for fallback scenarios. It iterates through the source nodes +* of each computation graph node and adds them to the `m_model_inputs` map if they meet +* specific criteria. +* +* Key behaviors: +* - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`. +* - Excludes intermediate nodes that are part of `m_node_info_list`. +* - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types +* and shapes, and assigns them friendly names. +* - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes. +* +* This function is critical for ensuring that fallback mechanisms have access to all +* required model inputs, particularly in scenarios where certain inputs are not +* explicitly defined in the original model configuration. 
+*/
 void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         ggml_tensor * node = m_cgraph->nodes[i];
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index e8e09b6d8ca..341bc768501 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -206,7 +206,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     bool m_is_static = false;
     bool m_is_prefill = false;
-    bool m_is_full_model = true;
+    bool m_is_full_model = true;  // whether the cgraph is the full model or a split subgraph
     int m_prefill_chunk_size = 0;

     static std::vector get_shape(const ggml_tensor * tensor);
@@ -218,8 +218,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
   private:
     void set_input_output(ggml_tensor * node, bool naive = false);
     int compute_op_case(const ggml_tensor * node) const;
+
+    // @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms.
     void compute_cgraph_dynamic_dims();
+    // @brief Adds extra model outputs to support fallback mechanisms.
     void add_extra_model_outputs_for_fallback();
+    // @brief Adds extra model inputs to support fallback mechanisms.
     void add_extra_model_inputs_for_fallback();

     void validate_cgraph() const;
@@ -234,7 +238,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map> m_model_weights;
     std::map m_model_outputs;
     std::vector m_node_info_list;
-    std::map m_node_dynamic_dims;
+    std::map m_node_dynamic_dims;  // map from ggml_tensor to its dynamic dimension index, -1 means static

     bool has_inp_tokens = false;
     bool has_output = false;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index d1c27c8b081..2c8344c6c92 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -489,7 +489,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder,
     }

     // If the tensor is a result of PERMUTE operation, use ggml_cont to make it contiguous
-    if (ggml_tensor->op == GGML_OP_PERMUTE) {
+    if (ggml_tensor->op == GGML_OP_PERMUTE && !ggml_decoder->is_full_model()) {
         // Create a temporary context for ggml_cont operation
         // Need space for: tensor overhead, tensor data, graph structure, and work buffer
         size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024;

From fdc19db070a4942cb26cbc50bfff2dd4c5fe3066 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Thu, 25 Dec 2025 23:29:21 -0800
Subject: [PATCH 8/8] Fix VIEW op as sub-cgraph input & the view shape change issue

---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  7 +++--
 ggml/src/ggml-openvino/utils.cpp        | 34 +++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 5a78102a022..20cc02e98ba 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -65,7 +65,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
         set_input_output(cur_node);
     }

-    // m_is_full_model = has_inp_tokens && has_output;
     m_is_full_model = has_inp_tokens && has_output;
     if (!m_is_full_model) {
         compute_cgraph_dynamic_dims();
@@ -278,6 +277,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
             break;
         }
@@ -969,6 +971,7 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
         case
GGML_OP_ADD: case GGML_OP_GLU: case GGML_OP_ROPE: + case GGML_OP_SCALE: m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; break; case GGML_OP_CPY: @@ -1056,7 +1059,7 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { bool is_intermediate_node = false; for (const auto & node_info : m_node_info_list) { - if (node_info.node_name == src_name) { + if (node_info.node == src) { is_intermediate_node = true; break; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2c8344c6c92..5e0f5cb097d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,7 +6,6 @@ #include "ggml-cpu.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" - #include #include #include @@ -481,7 +480,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); auto * input_data = ggml_tensor->data; ov::Shape input_shape; - if (ggml_tensor->op == GGML_OP_VIEW) { + if (0) { // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { @@ -521,6 +520,37 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, return input_tensor; } + // If the tensor is a result of VIEW operation, use ggml_cont to make it contiguous + if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_full_model()) { + // if the ggml_tensor shape size is equal to the source tensor shape size, no need to reconstruct the ov input tensor data + if (ggml_nelements(ggml_tensor) == ggml_nelements(ggml_tensor->view_src)) { + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); + return input_tensor; + } + + // Create OpenVINO input tensor, the data need to reconstructed based on the view tensor shape & stride + // Todo: parallel copy & the copy the whole last dim one loop (perf improve) + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + const auto * src_tensor = ggml_tensor->view_src; + size_t des_index = 0; + for (size_t i0 = 0; i0 < static_cast(ggml_tensor->ne[3]); i0++) { + for (size_t i1 = 0; i1 < static_cast(ggml_tensor->ne[2]); i1++) { + for (size_t i2 = 0; i2 < static_cast(ggml_tensor->ne[1]); i2++) { + for (size_t i3 = 0; i3 < static_cast(ggml_tensor->ne[0]); i3++) { + size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] + + i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0]; + + memcpy(static_cast(input_tensor.data()) + des_index, + static_cast(src_tensor->data) + src_index, ggml_tensor->nb[0]); + des_index += ggml_tensor->nb[0]; + } + } + } + } + return input_tensor; + } + + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; }
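
The full-model check introduced in PATCH 1/8 boils down to scanning the cgraph for the "inp_tokens" input and for an output-flagged (or output-named) tensor. Below is a minimal standalone sketch of that heuristic, assuming the tensor-naming conventions used in the patches; cgraph_is_full_model is an illustrative helper, not part of the backend.

#include <cstring>
#include <string>

#include "ggml.h"

// Heuristic behind m_is_full_model: a cgraph is treated as the full model only if it
// both consumes the "inp_tokens" input and produces an output-flagged / output-named
// tensor. Sub-cgraphs created for CPU fallback are missing at least one of the two.
static bool cgraph_is_full_model(ggml_cgraph * cgraph) {
    bool has_inp_tokens = false;
    bool has_output     = false;
    for (int i = 0; i < ggml_graph_n_nodes(cgraph); i++) {
        const ggml_tensor * node = ggml_graph_node(cgraph, i);
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            const ggml_tensor * src = node->src[s];
            if (src != nullptr && std::strcmp(src->name, "inp_tokens") == 0) {
                has_inp_tokens = true;
            }
        }
        if ((node->flags & GGML_TENSOR_FLAG_OUTPUT) != 0 ||
            std::string(node->name).find("output") != std::string::npos) {
            has_output = true;
        }
    }
    return has_inp_tokens && has_output;
}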
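
PATCH 2/8's compute_cgraph_dynamic_dims() walks the graph with a recursive lambda and records, per tensor, which ggml dimension is dynamic (-1 meaning fully static). Below is a trimmed sketch of the same propagation idea for a few op classes, assuming cgraph->nodes is already topologically ordered so a single forward pass suffices; propagate_dynamic_dims is an illustrative name and the switch is deliberately incomplete.

#include <map>
#include <string>

#include "ggml.h"

// Per-tensor dynamic-dimension index: -1 means static, otherwise the ggml dim that
// changes with the batch / token count.
static void propagate_dynamic_dims(ggml_cgraph * graph, std::map<const ggml_tensor *, int> & dyn) {
    auto src_dyn = [&](const ggml_tensor * t) {
        auto it = dyn.find(t);
        return it == dyn.end() ? -1 : it->second;
    };
    for (int i = 0; i < ggml_graph_n_nodes(graph); i++) {
        const ggml_tensor * node = ggml_graph_node(graph, i);
        // leaf inputs: the token/position vectors grow with the batch, so dim 0 is dynamic
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            const ggml_tensor * src = node->src[s];
            if (src != nullptr && dyn.count(src) == 0) {
                std::string n = src->name;
                dyn[src] = (n == "inp_tokens" || n == "inp_pos" || n == "inp_out_ids") ? 0 : -1;
            }
        }
        switch (node->op) {
            case GGML_OP_ADD:       // element-wise ops keep the dynamic axis of their input
            case GGML_OP_MUL:
            case GGML_OP_SCALE:
            case GGML_OP_RMS_NORM:
                dyn[node] = src_dyn(node->src[0]);
                break;
            case GGML_OP_GET_ROWS:  // a dynamic row-id vector makes dim 1 of the result dynamic
                dyn[node] = src_dyn(node->src[1]) != -1 ? 1 : -1;
                break;
            default:                // anything unhandled is treated as static in this sketch
                dyn[node] = -1;
                break;
        }
    }
}

The patch uses a recursive visitor instead of a plain loop; the result is the same here because ggml stores cgraph nodes in evaluation order, so every source is classified before the node that consumes it.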
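
The dynamic_dim_index parameter added to get_graph_input_shape() marks one axis of the resulting ov::PartialShape as dynamic. Because ggml stores dimensions innermost-first in ne[0..3] while the OpenVINO shape is built outermost-first, ggml dimension d lands at shape index 3 - d, which is what input_shape[3 - dynamic_dim_index] = -1 expresses. A small sketch of that mapping, assuming a 4-D shape is always produced; to_ov_shape is an illustrative helper.

#include <openvino/openvino.hpp>

#include "ggml.h"

// Build a 4-D PartialShape for a ggml tensor, optionally marking one ggml dimension
// as dynamic. ggml's ne[] is innermost-first while this OV shape is outermost-first,
// hence the 3 - d index flip used by get_graph_input_shape().
static ov::PartialShape to_ov_shape(const ggml_tensor * t, int dynamic_dim_index = -1) {
    ov::PartialShape shape{t->ne[3], t->ne[2], t->ne[1], t->ne[0]};
    if (dynamic_dim_index >= 0 && dynamic_dim_index <= 3) {
        shape[3 - dynamic_dim_index] = ov::Dimension::dynamic();  // the patch assigns -1 with the same intent
    }
    return shape;
}

// Usage, mirroring how add_extra_model_inputs_for_fallback() wraps an extra input
// (parameter creation shown as in the patch, element type chosen for illustration):
// auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, to_ov_shape(src, dyn_idx));
// param->set_friendly_name(src->name);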
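
add_extra_model_outputs_for_fallback() exports every buffer-owning node of a sub-cgraph as a model output so that a later CPU split can read the data back, deduplicating by data pointer and (after PATCH 6/8) skipping VIEW nodes that alias another tensor's storage. A minimal sketch of that collection step, assuming outputs keyed by tensor name; collect_fallback_outputs is an illustrative helper.

#include <map>
#include <string>

#include "ggml.h"

// Collect fallback outputs for a sub-cgraph: every node that owns its own buffer may be
// consumed by a later split on another backend, so expose it as a model output.
static void collect_fallback_outputs(ggml_cgraph * graph, std::map<std::string, ggml_tensor *> & outputs) {
    std::map<void *, ggml_tensor *> by_address;
    for (int i = 0; i < ggml_graph_n_nodes(graph); i++) {
        ggml_tensor * node = ggml_graph_node(graph, i);
        if (node->op == GGML_OP_VIEW) {
            continue;  // a VIEW aliases another tensor's storage, never export it directly
        }
        by_address[node->data] = node;  // the last writer of a buffer wins
    }
    for (const auto & kv : by_address) {
        outputs.emplace(std::string(kv.second->name), kv.second);  // keeps any pre-existing entry
    }
}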
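
PATCH 6/8 copies permuted sub-cgraph inputs through ggml_cont in a throwaway ggml context before wrapping the data in an ov::Tensor. Below is a standalone usage sketch of that pattern; the single-threaded graph compute mirrors the patch, while the tensor sizes, the context size, and make_contiguous_copy are illustrative.

#include <stdexcept>

#include "ggml.h"
#include "ggml-cpu.h"  // ggml_graph_compute_with_ctx

// Make a contiguous copy of a (possibly permuted) tensor using a temporary ggml graph,
// mirroring the PERMUTE handling added to convert_ggml_input_to_ov().
static ggml_tensor * make_contiguous_copy(ggml_context * tmp_ctx, ggml_tensor * src) {
    ggml_tensor * dst = ggml_cont(tmp_ctx, src);                 // records the copy op
    ggml_cgraph * gf  = ggml_new_graph(tmp_ctx);
    ggml_build_forward_expand(gf, dst);
    ggml_graph_compute_with_ctx(tmp_ctx, gf, /*n_threads=*/1);   // actually performs it
    return dst;                                                  // dst->data is now dense
}

int main() {
    ggml_init_params params = { /*mem_size=*/ 16u * 1024 * 1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    ggml_context * ctx = ggml_init(params);
    if (ctx == nullptr) {
        throw std::runtime_error("ggml_init failed");
    }

    ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3);  // swap the first two dims: non-contiguous view
    ggml_tensor * c = make_contiguous_copy(ctx, p);      // dense data, safe to memcpy into an ov::Tensor
    (void) c;
    ggml_free(ctx);
    return 0;
}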
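
PATCH 8/8 repacks a strided VIEW source into a dense ov::Tensor element by element and leaves a TODO to copy a whole innermost row per memcpy. A sketch of that row-wise variant, under the assumption that nb[0] equals the element size (i.e. the innermost dimension of the view is contiguous); repack_view_rowwise is an illustrative helper, not the patch's code.

#include <cstdint>
#include <cstring>

#include "ggml.h"

// Repack a strided VIEW tensor into a dense buffer one row at a time instead of one
// element at a time. Assumes nb[0] == element size, so each ne[0]-long row is contiguous.
static void repack_view_rowwise(const ggml_tensor * view, void * dst) {
    const auto * src_base  = static_cast<const uint8_t *>(view->view_src->data);
    auto *       out       = static_cast<uint8_t *>(dst);
    const size_t row_bytes = (size_t) view->ne[0] * view->nb[0];
    for (int64_t i3 = 0; i3 < view->ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < view->ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < view->ne[1]; i1++) {
                const size_t src_off = view->view_offs +
                    (size_t) i3 * view->nb[3] + (size_t) i2 * view->nb[2] + (size_t) i1 * view->nb[1];
                std::memcpy(out, src_base + src_off, row_bytes);  // one memcpy per innermost row
                out += row_bytes;
            }
        }
    }
}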