From 90a14302b60c0600c500abf169a9d0121d14a3a0 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 18:15:15 -0800 Subject: [PATCH 1/8] Add m_is_full_model in decoder class to identify the graph is subgraph or not for fallback using --- ggml/src/ggml-openvino/ggml-decoder.cpp | 9 +++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 6 ++++++ ggml/src/ggml-openvino/openvino/decoder.hpp | 2 ++ ggml/src/ggml-openvino/openvino/translate_session.cpp | 2 +- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 72f6144708a..2b007e1562d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -66,6 +66,8 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } + m_is_full_model = has_inp_tokens && has_output; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node); @@ -150,6 +152,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); + if (src_name == "inp_tokens") { + has_inp_tokens = true; + } + // Add model inputs if (!naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; @@ -176,6 +182,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (!naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; + if (node_output_name.find("output") != std::string::npos) { + has_output = true; + } // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 111eb7200b8..361a9c9434f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -202,8 +202,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; } + virtual bool is_full_model() const override {return m_is_full_model; } + bool m_is_static = false; bool m_is_prefill = false; + bool m_is_full_model = true; int m_prefill_chunk_size = 0; static std::vector get_shape(const ggml_tensor * tensor); @@ -229,6 +232,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map m_model_outputs; std::vector m_node_info_list; + bool has_inp_tokens = false; + bool has_output = false; + ModelParams m_model_params; ComputeParams m_compute_params; }; diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp index 1603c7fd201..71d3c26e9c9 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.hpp +++ b/ggml/src/ggml-openvino/openvino/decoder.hpp @@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase { virtual int get_op_case(int node_idx) const = 0; + virtual bool is_full_model() const = 0; + virtual const std::map>& get_model_inputs() const = 0; virtual const std::map>& get_model_extra_inputs() const = 0; virtual const std::map>& get_model_weights() const = 0; diff 
--git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 546778a4707..45fe19d4918 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -190,7 +190,7 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo } }; - if (!m_naive) { + if (!m_naive && ggml_model_decoder->is_full_model()) { preprocess(*tensor_map, *ggml_model_decoder); } ggml_model_decoder->visit_subgraph(node_visitor); From 8422032c280bd7744078785bf1c7f47116bcf13a Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 21:21:53 -0800 Subject: [PATCH 2/8] Fallback to CPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 156 ++++++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 7 +- ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +- ggml/src/ggml-openvino/utils.cpp | 3 +- 4 files changed, 164 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2b007e1562d..5c2465fa39c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -59,7 +59,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, } validate_cgraph(); - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; m_nodes.push_back(cur_node); @@ -67,6 +66,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, } m_is_full_model = has_inp_tokens && has_output; + if (!m_is_full_model) { + compute_cgraph_dynamic_dims(); + add_extra_model_inputs_for_fallback(); + add_extra_model_outputs_for_fallback(); + } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); @@ -368,7 +372,7 @@ void GgmlOvDecoder::validate_cgraph() const { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const { auto name = std::string(input->name); ov::PartialShape input_shape; @@ -400,6 +404,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else { input_shape = ov::PartialShape{get_shape(input)}; } + if (dynamic_dim_index != -1) { + input_shape[3-dynamic_dim_index] = -1; + } return input_shape; } @@ -872,3 +879,148 @@ const std::string & GgmlOvDecoder::get_op_type() const { static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } + +void GgmlOvDecoder::compute_cgraph_dynamic_dims() { + // lambda递归查找所有输入节点 + auto visit_node = [&](auto && self, ggml_tensor * node) -> void { + if (!node) { + return; + } + + if (node->op == GGML_OP_CPY) { + m_node_dynamic_dims[node] = -1; + } + + if (m_node_dynamic_dims.count(node)) { + return; + } + // 这里可以根据实际需求设置dynamic dim,这里用ne[0]举例 + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * src = node->src[i]; + if (src) { + self(self, src); + } + } + switch (node->op) { + case GGML_OP_NONE: + m_node_dynamic_dims[node] = -1; + if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" || + std::string(node->name) == "inp_out_ids") { + m_node_dynamic_dims[node] = 0; + } + break; + case GGML_OP_GET_ROWS: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = 1; + } + break; + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: 
+ m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + } + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + } + break; + case GGML_OP_VIEW: + case GGML_OP_FLASH_ATTN_EXT: + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + int same_dim_count = 0; + for (int i = 0; i < 4; i++) { + if (node->ne[i] == dynamic_dim_value) { + m_node_dynamic_dims[node] = i; + same_dim_count++; + } + } + if (same_dim_count != 1) { + std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl; + } + } + break; + case GGML_OP_RMS_NORM: + case GGML_OP_ADD: + case GGML_OP_GLU: + case GGML_OP_ROPE: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: + m_node_dynamic_dims[node] = -1; + break; + default: + std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl; + break; + } + }; + + // 对所有节点递归处理 + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + visit_node(visit_node, node); + } +} + +void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { + std::map address_map; + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + address_map[node->data] = node; + } + + for (const auto & pair : address_map) { + const std::string & name = pair.second->name; + if (m_model_outputs.find(name) == m_model_outputs.end() && + name.find("view") == std::string::npos && + name.find("ffn") == std::string::npos) { + m_model_outputs[name] = pair.second; + } + } +} + +void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto * src = node->src[i]; + if (src == nullptr) { + continue; + } + if (src->view_src) { + continue; + } + std::string src_name = std::string(src->name); + if (m_model_weights.find(src_name) != m_model_weights.end()) { + continue; + } + // 在m_node_info_list的node_name中找src->name,如果找到了,说明src是中间节点,不是输入节点 + bool is_intermediate_node = false; + for (const auto & node_info : m_node_info_list) { + if (node_info.node_name == src_name) { + is_intermediate_node = true; + break; + } + } + if (is_intermediate_node) { + continue; + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + + m_inputs[src_name] = src; + auto param_node = std::make_shared( + get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src])); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 361a9c9434f..befced43608 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -175,7 +175,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_static() const override { return m_is_static; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, 
const ggml_tensor * input, int dynamic_dim_index=-1) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -208,6 +208,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_is_full_model = true; int m_prefill_chunk_size = 0; + bool m_xuejun = false; static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); @@ -218,6 +219,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { private: void set_input_output(ggml_tensor * node, bool naive = false); int compute_op_case(const ggml_tensor * node) const; + void compute_cgraph_dynamic_dims(); + void add_extra_model_outputs_for_fallback(); + void add_extra_model_inputs_for_fallback(); void validate_cgraph() const; @@ -231,6 +235,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> m_model_weights; std::map m_model_outputs; std::vector m_node_info_list; + std::map m_node_dynamic_dims; bool has_inp_tokens = false; bool has_output = false; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f70..da419fc15f4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -287,6 +287,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, + // GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -318,7 +319,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: { auto supported = supported_ops.find(op->op) != supported_ops.end(); if (!supported) { - GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1f94d4bad60..442521e01e7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -118,6 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); + ggml_decoder->m_xuejun = true; decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -432,7 +433,7 @@ std::map get_types_to_requant(const std::string & dev } bool is_naive(ggml_cgraph * cgraph) { - constexpr int naive_graph_size_threshold = 20; + constexpr int naive_graph_size_threshold = 0; return cgraph->n_nodes < naive_graph_size_threshold; } From 9e489b9145ffe6b4b23d312f3533d5a5b6ec284e Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 4 Dec 2025 22:50:19 -0800 Subject: [PATCH 3/8] clean code --- ggml/src/ggml-openvino/ggml-openvino.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index da419fc15f4..e809d250f70 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ 
b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -287,7 +287,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - // GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; @@ -319,7 +318,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con default: { auto supported = supported_ops.find(op->op) != supported_ops.end(); if (!supported) { - // GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } } From a959262cc8f1c856d5d5e18b126426865762f879 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Mon, 8 Dec 2025 17:03:16 -0800 Subject: [PATCH 4/8] clean code --- ggml/src/ggml-openvino/ggml-decoder.cpp | 5 +---- ggml/src/ggml-openvino/ggml-decoder.h | 1 - ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 5c2465fa39c..f2cdbf2f004 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -881,7 +881,6 @@ const std::string & GgmlOvDecoder::get_op_type() const { } void GgmlOvDecoder::compute_cgraph_dynamic_dims() { - // lambda递归查找所有输入节点 auto visit_node = [&](auto && self, ggml_tensor * node) -> void { if (!node) { return; @@ -894,7 +893,6 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { if (m_node_dynamic_dims.count(node)) { return; } - // 这里可以根据实际需求设置dynamic dim,这里用ne[0]举例 for (int i = 0; i < GGML_MAX_SRC; i++) { ggml_tensor * src = node->src[i]; if (src) { @@ -961,7 +959,6 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { } }; - // 对所有节点递归处理 for (int i = 0; i < m_cgraph->n_nodes; i++) { ggml_tensor * node = m_cgraph->nodes[i]; visit_node(visit_node, node); @@ -1000,7 +997,7 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { if (m_model_weights.find(src_name) != m_model_weights.end()) { continue; } - // 在m_node_info_list的node_name中找src->name,如果找到了,说明src是中间节点,不是输入节点 + bool is_intermediate_node = false; for (const auto & node_info : m_node_info_list) { if (node_info.node_name == src_name) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index befced43608..e8e09b6d8ca 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -208,7 +208,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_is_full_model = true; int m_prefill_chunk_size = 0; - bool m_xuejun = false; static std::vector get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f70..25c854811eb 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -286,7 +286,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, 
        GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-       GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
+       GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE,
        // softmax is not updated due to replaced by flash_attn_ext
        // GGML_OP_SOFT_MAX,
        GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 442521e01e7..13fea7f0519 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -118,7 +118,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
     auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
     ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static);
-    ggml_decoder->m_xuejun = true;
     decoder_end_time = ggml_time_us();

     auto input_model = std::make_shared(ggml_decoder);

From bcb6945b01ff03385748619436b2579b62436fd1 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Mon, 8 Dec 2025 17:36:59 -0800
Subject: [PATCH 5/8] recover code

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 25c854811eb..e809d250f70 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -286,7 +286,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
     static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD,
        GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
-       GGML_OP_GET_ROWS, GGML_OP_RMS_NORM, GGML_OP_SCALE,
+       GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
        // softmax is not updated due to replaced by flash_attn_ext
        // GGML_OP_SOFT_MAX,
        GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};

From 1504e76645404c8ec69aad33fa0f8329b2838a49 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Mon, 15 Dec 2025 17:34:26 -0800
Subject: [PATCH 6/8] Fix errors & make the fallback handling more general

---
 ggml/src/ggml-openvino/ggml-decoder.cpp | 10 +++----
 ggml/src/ggml-openvino/utils.cpp        | 35 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index f2cdbf2f004..22f15c154f8 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -969,14 +969,15 @@ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
     std::map address_map;
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         ggml_tensor * node = m_cgraph->nodes[i];
+        if (node->op == GGML_OP_VIEW) {
+            continue;
+        }
         address_map[node->data] = node;
     }

     for (const auto & pair : address_map) {
         const std::string & name = pair.second->name;
-        if (m_model_outputs.find(name) == m_model_outputs.end() &&
-            name.find("view") == std::string::npos &&
-            name.find("ffn") == std::string::npos) {
+        if (m_model_outputs.find(name) == m_model_outputs.end()) {
             m_model_outputs[name] = pair.second;
         }
     }
@@ -990,9 +991,6 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
             if (src == nullptr) {
                 continue;
             }
-            if (src->view_src) {
-                continue;
-            }
             std::string src_name = std::string(src->name);
             if (m_model_weights.find(src_name) != m_model_weights.end()) {
                 continue;
             }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index
13fea7f0519..d1c27c8b081 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" +#include "ggml-cpu.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" @@ -486,6 +487,40 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, } else { input_shape = ggml_decoder->get_shape(ggml_tensor); } + + // If the tensor is a result of PERMUTE operation, use ggml_cont to make it contiguous + if (ggml_tensor->op == GGML_OP_PERMUTE) { + // Create a temporary context for ggml_cont operation + // Need space for: tensor overhead, tensor data, graph structure, and work buffer + size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024; + struct ggml_init_params params = { + /*.mem_size =*/mem_size, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/false, + }; + struct ggml_context * temp_ctx = ggml_init(params); + if (temp_ctx == NULL) { + throw std::runtime_error("Failed to initialize temporary context for PERMUTE"); + } + + // Create contiguous tensor using ggml_cont + struct ggml_tensor * cont_tensor = ggml_cont(temp_ctx, const_cast(ggml_tensor)); + + // Build a simple graph to compute ggml_cont + struct ggml_cgraph * gf = ggml_new_graph(temp_ctx); + ggml_build_forward_expand(gf, cont_tensor); + ggml_graph_compute_with_ctx(temp_ctx, gf, 1); + + // Create OpenVINO tensor with contiguous data + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + memcpy(input_tensor.data(), cont_tensor->data, ggml_nbytes(cont_tensor)); + + // Free temporary context + ggml_free(temp_ctx); + + return input_tensor; + } + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; } From 54ebe74d6d1dc73dea142e2ecf035e0e0e16e07e Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Tue, 23 Dec 2025 00:33:05 -0800 Subject: [PATCH 7/8] Add function description for fallback mechanisms --- ggml/src/ggml-openvino/ggml-decoder.cpp | 58 +++++++++++++++++++++++++ ggml/src/ggml-openvino/ggml-decoder.h | 8 +++- ggml/src/ggml-openvino/utils.cpp | 2 +- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 22f15c154f8..5a78102a022 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -65,6 +65,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, set_input_output(cur_node); } + // m_is_full_model = has_inp_tokens && has_output; if (!m_is_full_model) { compute_cgraph_dynamic_dims(); @@ -880,6 +881,27 @@ const std::string & GgmlOvDecoder::get_op_type() const { return unknown_op; } +/** + * @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms. + * + * This function traverses the computation graph and determines the dynamic dimensions + * for each node based on its operation type and dependencies. The dynamic dimension + * is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no dynamic + * dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL, GGML_OP_VIEW, + * etc., are handled to compute the dynamic dimension index. + * + * Key behaviors: + * - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others + * are analyzed to determine their dynamic dimensions. 
+ * - Nodes with specific names (e.g., "inp_tokens", "inp_pos", "inp_out_ids") are + * explicitly assigned a dynamic dimension index of 0. + * - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that + * the dynamic dimension is uniquely determined; otherwise, a warning is printed. + * - Unhandled operations print a message indicating the node name and operation type. + * + * This function is critical for preparing the computation graph for execution, ensuring + * that dynamic dimensions are correctly propagated and resolved. + */ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { auto visit_node = [&](auto && self, ggml_tensor * node) -> void { if (!node) { @@ -965,6 +987,23 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() { } } +/** + * @brief Adds extra model outputs to support fallback mechanisms. + * + * This function ensures that all relevant nodes in the computation graph are included + * as model outputs for fallback scenarios. It creates a mapping of tensor data addresses + * to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation. + * + * Key behaviors: + * - Iterates through all nodes in the computation graph and maps their data addresses + * to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW. + * - Adds nodes to the `m_model_outputs` map if they are not already present, using + * the tensor's name as the key. + * + * This function is essential for ensuring that fallback mechanisms have access to all + * necessary model outputs, particularly in scenarios where certain outputs are not + * explicitly defined in the original model configuration. + */ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { std::map address_map; for (int i = 0; i < m_cgraph->n_nodes; i++) { @@ -983,6 +1022,25 @@ void GgmlOvDecoder::add_extra_model_outputs_for_fallback() { } } +/** +* @brief Adds extra model inputs to support fallback mechanisms. +* +* This function ensures that all necessary input nodes in the computation graph are +* included as model inputs for fallback scenarios. It iterates through the source nodes +* of each computation graph node and adds them to the `m_model_inputs` map if they meet +* specific criteria. +* +* Key behaviors: +* - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`. +* - Excludes intermediate nodes that are part of `m_node_info_list`. +* - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types +* and shapes, and assigns them friendly names. +* - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes. +* +* This function is critical for ensuring that fallback mechanisms have access to all +* required model inputs, particularly in scenarios where certain inputs are not +* explicitly defined in the original model configuration. 
+*/
 void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         ggml_tensor * node = m_cgraph->nodes[i];
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index e8e09b6d8ca..341bc768501 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -206,7 +206,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     bool m_is_static = false;
     bool m_is_prefill = false;
-    bool m_is_full_model = true;
+    bool m_is_full_model = true;  // whether the cgraph is the full model or a split subgraph
     int m_prefill_chunk_size = 0;

     static std::vector get_shape(const ggml_tensor * tensor);
@@ -218,8 +218,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
   private:
     void set_input_output(ggml_tensor * node, bool naive = false);
     int compute_op_case(const ggml_tensor * node) const;
+
+    // @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms.
     void compute_cgraph_dynamic_dims();
+    // @brief Adds extra model outputs to support fallback mechanisms.
     void add_extra_model_outputs_for_fallback();
+    // @brief Adds extra model inputs to support fallback mechanisms.
     void add_extra_model_inputs_for_fallback();

     void validate_cgraph() const;
@@ -234,7 +238,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map> m_model_weights;
     std::map m_model_outputs;
     std::vector m_node_info_list;
-    std::map m_node_dynamic_dims;
+    std::map m_node_dynamic_dims;  // map from ggml_tensor to its dynamic dimension index, -1 means static

     bool has_inp_tokens = false;
     bool has_output = false;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index d1c27c8b081..2c8344c6c92 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -489,7 +489,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder,
     }

     // If the tensor is a result of PERMUTE operation, use ggml_cont to make it contiguous
-    if (ggml_tensor->op == GGML_OP_PERMUTE) {
+    if (ggml_tensor->op == GGML_OP_PERMUTE && !ggml_decoder->is_full_model()) {
         // Create a temporary context for ggml_cont operation
         // Need space for: tensor overhead, tensor data, graph structure, and work buffer
         size_t mem_size = ggml_tensor_overhead() * 4 + ggml_nbytes(ggml_tensor) * 2 + 1024 * 1024;

From fdc19db070a4942cb26cbc50bfff2dd4c5fe3066 Mon Sep 17 00:00:00 2001
From: Xuejun Zhai
Date: Thu, 25 Dec 2025 23:29:21 -0800
Subject: [PATCH 8/8] Fix VIEW op as sub-cgraph input & the view shape change issue

---
 ggml/src/ggml-openvino/ggml-decoder.cpp |  7 +++--
 ggml/src/ggml-openvino/utils.cpp        | 34 +++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 5a78102a022..20cc02e98ba 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -65,7 +65,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
         set_input_output(cur_node);
     }

-    // m_is_full_model = has_inp_tokens && has_output;
     m_is_full_model = has_inp_tokens && has_output;
     if (!m_is_full_model) {
         compute_cgraph_dynamic_dims();
@@ -278,6 +277,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
                 throw std::runtime_error("Unsupported VIEW case");
             }
             op_case = 2;
+            if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
+                op_case = 0;
+            }
             break;
         }
@@ -969,6 +971,7 @@ void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
         case
GGML_OP_ADD: case GGML_OP_GLU: case GGML_OP_ROPE: + case GGML_OP_SCALE: m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; break; case GGML_OP_CPY: @@ -1056,7 +1059,7 @@ void GgmlOvDecoder::add_extra_model_inputs_for_fallback() { bool is_intermediate_node = false; for (const auto & node_info : m_node_info_list) { - if (node_info.node_name == src_name) { + if (node_info.node == src) { is_intermediate_node = true; break; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2c8344c6c92..5e0f5cb097d 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -6,7 +6,6 @@ #include "ggml-cpu.h" #include "openvino/frontend.hpp" #include "openvino/input_model.hpp" - #include #include #include @@ -481,7 +480,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); auto * input_data = ggml_tensor->data; ov::Shape input_shape; - if (ggml_tensor->op == GGML_OP_VIEW) { + if (0) { // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { @@ -521,6 +520,37 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, return input_tensor; } + // If the tensor is a result of VIEW operation, use ggml_cont to make it contiguous + if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_full_model()) { + // if the ggml_tensor shape size is equal to the source tensor shape size, no need to reconstruct the ov input tensor data + if (ggml_nelements(ggml_tensor) == ggml_nelements(ggml_tensor->view_src)) { + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); + return input_tensor; + } + + // Create OpenVINO input tensor, the data need to reconstructed based on the view tensor shape & stride + // Todo: parallel copy & the copy the whole last dim one loop (perf improve) + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + const auto * src_tensor = ggml_tensor->view_src; + size_t des_index = 0; + for (size_t i0 = 0; i0 < static_cast(ggml_tensor->ne[3]); i0++) { + for (size_t i1 = 0; i1 < static_cast(ggml_tensor->ne[2]); i1++) { + for (size_t i2 = 0; i2 < static_cast(ggml_tensor->ne[1]); i2++) { + for (size_t i3 = 0; i3 < static_cast(ggml_tensor->ne[0]); i3++) { + size_t src_index = ggml_tensor->view_offs + i0 * ggml_tensor->nb[3] + i1 * ggml_tensor->nb[2] + + i2 * ggml_tensor->nb[1] + i3 * ggml_tensor->nb[0]; + + memcpy(static_cast(input_tensor.data()) + des_index, + static_cast(src_tensor->data) + src_index, ggml_tensor->nb[0]); + des_index += ggml_tensor->nb[0]; + } + } + } + } + return input_tensor; + } + + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; }
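
The full-model check introduced in PATCH 1/8 boils down to scanning the cgraph for the "inp_tokens" input and for an output-flagged (or output-named) tensor. Below is a minimal standalone sketch of that heuristic, assuming the tensor-naming conventions used in the patches; cgraph_is_full_model is an illustrative helper, not part of the backend.

#include <cstring>
#include <string>

#include "ggml.h"

// Heuristic behind m_is_full_model: a cgraph is treated as the full model only if it
// both consumes the "inp_tokens" input and produces an output-flagged / output-named
// tensor. Sub-cgraphs created for CPU fallback are missing at least one of the two.
static bool cgraph_is_full_model(ggml_cgraph * cgraph) {
    bool has_inp_tokens = false;
    bool has_output     = false;
    for (int i = 0; i < ggml_graph_n_nodes(cgraph); i++) {
        const ggml_tensor * node = ggml_graph_node(cgraph, i);
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            const ggml_tensor * src = node->src[s];
            if (src != nullptr && std::strcmp(src->name, "inp_tokens") == 0) {
                has_inp_tokens = true;
            }
        }
        if ((node->flags & GGML_TENSOR_FLAG_OUTPUT) != 0 ||
            std::string(node->name).find("output") != std::string::npos) {
            has_output = true;
        }
    }
    return has_inp_tokens && has_output;
}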
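
PATCH 2/8's compute_cgraph_dynamic_dims() walks the graph with a recursive lambda and records, per tensor, which ggml dimension is dynamic (-1 meaning fully static). Below is a trimmed sketch of the same propagation idea for a few op classes, assuming cgraph->nodes is already topologically ordered so a single forward pass suffices; propagate_dynamic_dims is an illustrative name and the switch is deliberately incomplete.

#include <map>
#include <string>

#include "ggml.h"

// Per-tensor dynamic-dimension index: -1 means static, otherwise the ggml dim that
// changes with the batch / token count.
static void propagate_dynamic_dims(ggml_cgraph * graph, std::map<const ggml_tensor *, int> & dyn) {
    auto src_dyn = [&](const ggml_tensor * t) {
        auto it = dyn.find(t);
        return it == dyn.end() ? -1 : it->second;
    };
    for (int i = 0; i < ggml_graph_n_nodes(graph); i++) {
        const ggml_tensor * node = ggml_graph_node(graph, i);
        // leaf inputs: the token/position vectors grow with the batch, so dim 0 is dynamic
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            const ggml_tensor * src = node->src[s];
            if (src != nullptr && dyn.count(src) == 0) {
                std::string n = src->name;
                dyn[src] = (n == "inp_tokens" || n == "inp_pos" || n == "inp_out_ids") ? 0 : -1;
            }
        }
        switch (node->op) {
            case GGML_OP_ADD:       // element-wise ops keep the dynamic axis of their input
            case GGML_OP_MUL:
            case GGML_OP_SCALE:
            case GGML_OP_RMS_NORM:
                dyn[node] = src_dyn(node->src[0]);
                break;
            case GGML_OP_GET_ROWS:  // a dynamic row-id vector makes dim 1 of the result dynamic
                dyn[node] = src_dyn(node->src[1]) != -1 ? 1 : -1;
                break;
            default:                // anything unhandled is treated as static in this sketch
                dyn[node] = -1;
                break;
        }
    }
}

The patch uses a recursive visitor instead of a plain loop; the result is the same here because ggml stores cgraph nodes in evaluation order, so every source is classified before the node that consumes it.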
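
The dynamic_dim_index parameter added to get_graph_input_shape() marks one axis of the resulting ov::PartialShape as dynamic. Because ggml stores dimensions innermost-first in ne[0..3] while the OpenVINO shape is built outermost-first, ggml dimension d lands at shape index 3 - d, which is what input_shape[3 - dynamic_dim_index] = -1 expresses. A small sketch of that mapping, assuming a 4-D shape is always produced; to_ov_shape is an illustrative helper.

#include <openvino/openvino.hpp>

#include "ggml.h"

// Build a 4-D PartialShape for a ggml tensor, optionally marking one ggml dimension
// as dynamic. ggml's ne[] is innermost-first while this OV shape is outermost-first,
// hence the 3 - d index flip used by get_graph_input_shape().
static ov::PartialShape to_ov_shape(const ggml_tensor * t, int dynamic_dim_index = -1) {
    ov::PartialShape shape{t->ne[3], t->ne[2], t->ne[1], t->ne[0]};
    if (dynamic_dim_index >= 0 && dynamic_dim_index <= 3) {
        shape[3 - dynamic_dim_index] = ov::Dimension::dynamic();  // the patch assigns -1 with the same intent
    }
    return shape;
}

// Usage, mirroring how add_extra_model_inputs_for_fallback() wraps an extra input
// (parameter creation shown as in the patch, element type chosen for illustration):
// auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, to_ov_shape(src, dyn_idx));
// param->set_friendly_name(src->name);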
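
add_extra_model_outputs_for_fallback() exports every buffer-owning node of a sub-cgraph as a model output so that a later CPU split can read the data back, deduplicating by data pointer and (after PATCH 6/8) skipping VIEW nodes that alias another tensor's storage. A minimal sketch of that collection step, assuming outputs keyed by tensor name; collect_fallback_outputs is an illustrative helper.

#include <map>
#include <string>

#include "ggml.h"

// Collect fallback outputs for a sub-cgraph: every node that owns its own buffer may be
// consumed by a later split on another backend, so expose it as a model output.
static void collect_fallback_outputs(ggml_cgraph * graph, std::map<std::string, ggml_tensor *> & outputs) {
    std::map<void *, ggml_tensor *> by_address;
    for (int i = 0; i < ggml_graph_n_nodes(graph); i++) {
        ggml_tensor * node = ggml_graph_node(graph, i);
        if (node->op == GGML_OP_VIEW) {
            continue;  // a VIEW aliases another tensor's storage, never export it directly
        }
        by_address[node->data] = node;  // the last writer of a buffer wins
    }
    for (const auto & kv : by_address) {
        outputs.emplace(std::string(kv.second->name), kv.second);  // keeps any pre-existing entry
    }
}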
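
PATCH 6/8 copies permuted sub-cgraph inputs through ggml_cont in a throwaway ggml context before wrapping the data in an ov::Tensor. Below is a standalone usage sketch of that pattern; the single-threaded graph compute mirrors the patch, while the tensor sizes, the context size, and make_contiguous_copy are illustrative.

#include <stdexcept>

#include "ggml.h"
#include "ggml-cpu.h"  // ggml_graph_compute_with_ctx

// Make a contiguous copy of a (possibly permuted) tensor using a temporary ggml graph,
// mirroring the PERMUTE handling added to convert_ggml_input_to_ov().
static ggml_tensor * make_contiguous_copy(ggml_context * tmp_ctx, ggml_tensor * src) {
    ggml_tensor * dst = ggml_cont(tmp_ctx, src);                 // records the copy op
    ggml_cgraph * gf  = ggml_new_graph(tmp_ctx);
    ggml_build_forward_expand(gf, dst);
    ggml_graph_compute_with_ctx(tmp_ctx, gf, /*n_threads=*/1);   // actually performs it
    return dst;                                                  // dst->data is now dense
}

int main() {
    ggml_init_params params = { /*mem_size=*/ 16u * 1024 * 1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    ggml_context * ctx = ggml_init(params);
    if (ctx == nullptr) {
        throw std::runtime_error("ggml_init failed");
    }

    ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 8);
    ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3);  // swap the first two dims: non-contiguous view
    ggml_tensor * c = make_contiguous_copy(ctx, p);      // dense data, safe to memcpy into an ov::Tensor
    (void) c;
    ggml_free(ctx);
    return 0;
}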
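
PATCH 8/8 repacks a strided VIEW source into a dense ov::Tensor element by element and leaves a TODO to copy a whole innermost row per memcpy. A sketch of that row-wise variant, under the assumption that nb[0] equals the element size (i.e. the innermost dimension of the view is contiguous); repack_view_rowwise is an illustrative helper, not the patch's code.

#include <cstdint>
#include <cstring>

#include "ggml.h"

// Repack a strided VIEW tensor into a dense buffer one row at a time instead of one
// element at a time. Assumes nb[0] == element size, so each ne[0]-long row is contiguous.
static void repack_view_rowwise(const ggml_tensor * view, void * dst) {
    const auto * src_base  = static_cast<const uint8_t *>(view->view_src->data);
    auto *       out       = static_cast<uint8_t *>(dst);
    const size_t row_bytes = (size_t) view->ne[0] * view->nb[0];
    for (int64_t i3 = 0; i3 < view->ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < view->ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < view->ne[1]; i1++) {
                const size_t src_off = view->view_offs +
                    (size_t) i3 * view->nb[3] + (size_t) i2 * view->nb[2] + (size_t) i1 * view->nb[1];
                std::memcpy(out, src_base + src_off, row_bytes);  // one memcpy per innermost row
                out += row_bytes;
            }
        }
    }
}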