diff --git a/docs/build.md b/docs/build.md
index f7e793c155a..9fa6fccad19 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -610,10 +610,23 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
       sudo apt-get update
       sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
       ```
+  - OpenCL
+    ```bash
+    sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+    ```
 - **Windows:**
-  - Download Microsoft.VisualStudio.2022.BuildTools [Visual_Studio_Build_Tools]https://aka.ms/vs/17/release/vs_BuildTools.exe Select "Desktop development with C++" under workloads.
+  - Download Microsoft.VisualStudio.2022.BuildTools: [Visual_Studio_Build_Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe)
+    Select "Desktop development with C++" under workloads
   - Install git
+  - Install OpenCL with vcpkg
+    ```powershell
+    cd C:\
+    git clone https://github.com/microsoft/vcpkg
+    cd vcpkg
+    bootstrap-vcpkg.bat
+    vcpkg install opencl
+    ```
   - Use "x64 Native Tools Command Prompt" for Build

 ### 1. Install OpenVINO Runtime

@@ -625,19 +638,19 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu
-
+
     ```bash
     wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
     chmod +x install-openvino-from-archive.sh
     ./install-openvino-from-archive.sh
     ```
+
+    Verify OpenVINO is initialized properly:
+    ```bash
+    echo $OpenVINO_DIR
+    ```
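Not part of this patch: since the backend now links against OpenCL for the GPU path (see the CMake and runtime changes later in this diff), it can also help to confirm that the OpenCL ICD sees the Intel GPU before building. This is only an optional sketch; the `clinfo` utility is an assumption here and is not installed by the package lists above.

```bash
# Optional sanity check of the GPU/OpenCL stack (clinfo is not among the packages listed above)
sudo apt install clinfo
clinfo -l      # should list an Intel OpenCL platform with a GPU device
ls /dev/dri    # render nodes (e.g. renderD128) used for GPU access
```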
-
-Verify OpenVINO is initialized properly
-- **Linux:**
-  ```bash
-  echo $OpenVINO_DIR
-  ```

 ### 2. Build llama.cpp with OpenVINO Backend

@@ -651,22 +664,16 @@ git switch dev_backend_openvino

 - **Linux:**
     ```bash
-    # Build with OpenVINO support
     source /opt/intel/openvino/setupvars.sh
     cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
-    cmake --build build/ReleaseOV --config Release -j $(nproc)
+    cmake --build build/ReleaseOV --parallel
     ```

-- **Windows:** 
+- **Windows:**
     ```bash
-    # Build with OpenVINO support
     "C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat"
-    cmake -B build/ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF
-    cmake --build build\ReleaseOV --config Release
-    ```
-    - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available.
-    ```bash
-    cmake --build build\ReleaseOV --config Release -- /m
+    cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+    cmake --build build\ReleaseOV --parallel
     ```

 ### 3. Download Sample Model
@@ -674,16 +681,9 @@ git switch dev_backend_openvino

 Download models for testing:

 ```bash
-# Create models directory
 mkdir -p ~/models/
-
-# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
-wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
-  -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf
-
-# Download model file: Phi-3-mini-4k-instruct-fp16.gguf
-wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
-  -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf
+wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
+  -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
 ```

 ### 4. Run inference with OpenVINO backend:
@@ -691,20 +691,14 @@ wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/P

 When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

 ```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-# Default device is GPU.
-# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU.
+# If GGML_OPENVINO_DEVICE is unset or the requested device is unavailable, the backend falls back to CPU.
 export GGML_OPENVINO_DEVICE=GPU
-
-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
 ```

 To run in chat mode:

 ```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
 ```

 ### Configuration Options
@@ -716,16 +710,11 @@ Control OpenVINO behavior using these environment variables:

 - **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
 - **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
 - **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
-- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging.
-- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging.
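The dump variables above are only named, not demonstrated, so a combined invocation may be easier to follow. The sketch below is illustrative only and is not an example added by this patch; setting the flags to `1` is an assumption (the list only says the variables enable the behavior).

```bash
# Illustrative only: dump the compute graph and the converted OpenVINO IR while running on GPU
export GGML_OPENVINO_DEVICE=GPU       # falls back to CPU if the GPU is unavailable
export GGML_OPENVINO_DUMP_CGRAPH=1    # saves the compute graph to cgraph.txt
export GGML_OPENVINO_DUMP_IR=1        # exports timestamped OpenVINO IR files
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```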

 ### Example with Profiling

 ```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-export GGML_OPENVINO_PROFILING=1
-
-GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
 ```

 ### Docker build Llama.cpp with OpenVINO Backend
@@ -741,7 +730,7 @@ docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile
 # Build a minimal CLI-only image containing just the llama-cli executable.
 docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .

-# Builds a server-only image with llama-server executable, health check endpoint, and REST API support. 
+# Builds a server-only image with llama-server executable, health check endpoint, and REST API support.
 docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .

 # If you are behind a proxy:
@@ -764,17 +753,17 @@ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
 docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
   --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
   llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
-``` 
+```

 Run Llama.cpp Server with OpenVINO Backend

 ```bash
 # Run the Server Docker container server
-docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf 
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

 # In a NEW terminal, test the server with curl
 # If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
-export NO_PROXY=localhost,127.0.0.1 
+export NO_PROXY=localhost,127.0.0.1

 # Test health endpoint
 curl -f http://localhost:8080/health
diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h
index b690a16378e..46c1485f663 100644
--- a/ggml/include/ggml-openvino.h
+++ b/ggml/include/ggml-openvino.h
@@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);

 GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);

+GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
+
 // device buffer
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
+
 GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
@@ -43,8 +51,10 @@ struct ggml_openvino_device_info {
     std::array default_tensor_split = {};
 };

-const ggml_openvino_device_info & ggml_openvino_info();
-
 #ifdef __cplusplus
 }
 #endif
+
+#ifdef __cplusplus
+const ggml_openvino_device_info & ggml_openvino_info();
+#endif
diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt
index 3051a8b2405..175b585661d 100644
--- a/ggml/src/ggml-openvino/CMakeLists.txt
+++ b/ggml/src/ggml-openvino/CMakeLists.txt
@@ -1,4 +1,5 @@
 find_package(OpenVINO REQUIRED)
+find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake") @@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino ${GGML_HEADERS_OPENVINO} ) -target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 275a8a216ae..51fb433410c 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,6 +2,8 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-openvino-extra.h" +#include "ggml-openvino.h" #include "ggml-quants.hpp" #include @@ -17,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +36,7 @@ #include #include #include +#include #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, @@ -292,6 +296,9 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { auto * cache_k_perm = node->src[1]; + if (cache_k_perm->op == GGML_OP_CPY) { + cache_k_perm = cache_k_perm->src[0]; + } assert(cache_k_perm->op == GGML_OP_PERMUTE); auto * cache_k_view = cache_k_perm->src[0]; assert(cache_k_view->op == GGML_OP_VIEW); @@ -468,9 +475,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name // return kv_param_res_names; // } -std::map> GgmlOvDecoder::create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map> model_weights; static std::mutex weights_mutex; auto * nodes = cgraph->nodes; @@ -495,10 +500,7 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto requant_type = types_to_requantize.count(src->type) ? 
- std::optional(types_to_requantize.at(src->type)) : - std::nullopt; - auto weight_node = create_weight_node(src, requant_type); + auto weight_node = create_weight_node(src); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -512,94 +514,70 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, - std::optional requant_type) { - std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; - if (weight_types.find(tensor->type) == weight_types.end()) { - throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + - ggml_type_name(tensor->type)); - } - - auto node_type = get_ov_type(tensor); - auto node_shape = get_shape(tensor); - auto ne_total = ggml_nelements(tensor); +// Static cache for quantized weight nodes (keyed by tensor data pointer) +// This is a fallback for when tensors don't have pre-built constants in extra +static std::unordered_map> s_quantized_weight_cache; +static std::mutex s_quantized_weight_cache_mutex; - OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - - // F16 and F32 case - if (node_type != ov::element::dynamic) { - ov::Tensor weights(node_type, node_shape); - memcpy(weights.data(), tensor->data, ne_total * node_type.size()); - std::shared_ptr weight_node = std::make_shared(weights); - // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU - // if (node_type == ov::element::f16) { - // weight_node = std::make_shared(weight_node, ov::element::f32); - // } - weight_node->set_friendly_name(tensor->name); - return weight_node; +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { + // Check if we have a pre-built constant from the OpenVINO backend buffer + // This is set during ggml_backend_openvino_buffer_set_tensor + if (tensor->extra) { + if (!ggml_backend_buffer_is_openvino(tensor->buffer)) { + OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a cpu backend repacked quantized weights"); + } + // Cast to our extra base type and check the type + auto * extra_base = static_cast(tensor->extra); + + if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) { + // F16/F32/BF16 weight with shared-memory constant + auto * weight_extra = static_cast(tensor->extra); + if (weight_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name); + return weight_extra->constant; + } + } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) { + // Quantized weight with pre-extracted data + auto * quant_extra = static_cast(tensor->extra); + if (quant_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name); + return quant_extra->constant; + } + } } - // Quantized case - OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a repacked quantized weights"); - - if (requant_type.has_value()) { - return requantize(tensor, requant_type.value()); + // Fallback: Check static cache for quantized weights 
(keyed by data pointer) + // This handles cases where tensors weren't loaded through OpenVINO buffer + if (ggml_is_quantized(tensor->type)) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + auto it = s_quantized_weight_cache.find(tensor->data); + if (it != s_quantized_weight_cache.end()) { + GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name); + return it->second; + } } - ov::element::Type weight_type; - if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { - weight_type = ov::element::u4; - } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K - weight_type = ov::element::u8; - } + GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra); - uint64_t weights_per_block; - // here we only consider sub block, q6k:16 q4k:32 q5k:32 - if (tensor->type == GGML_TYPE_Q6_K) { - weights_per_block = 16; - } else { - weights_per_block = 32; + std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; + if (weight_types.find(tensor->type) == weight_types.end()) { + throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + + ggml_type_name(tensor->type)); } - OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name, - " has incompatible last dim shape: ", node_shape.back()); - - ov::Tensor weights(weight_type, node_shape); - // For scales and biases - node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; - ov::Tensor scales(ov::element::f16, node_shape); - ov::Tensor biases(ov::element::f16, node_shape); + std::shared_ptr result = process_weight_tensor(tensor, tensor->data, nullptr); + result->set_friendly_name(tensor->name); - ov::Output weight_node; - if (tensor->type == GGML_TYPE_Q4_0) { - extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_1) { - extract_q4_1_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q8_0) { - extract_q8_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q6_K) { - extract_q6_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_K) { - extract_q4_k_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q5_K) { - extract_q5_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + // Cache the quantized weight node for future reuse + if (ggml_is_quantized(tensor->type)) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + s_quantized_weight_cache[tensor->data] = result; + GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); } - OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); - - weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); - return weight_node.get_node_shared_ptr(); + 
return result; } void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { @@ -618,11 +596,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(20) << "op" << std::setw(20) << "name" << std::setw(3) << " " - << std::setw(50) << "stride" + << std::setw(62) << "stride" + << std::setw(20) << "buffer_type" << "\n"; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // Get buffer type name + const char * buf_name = "none"; + ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer; + if (buf) { + buf_name = ggml_backend_buffer_name(buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " @@ -635,10 +621,18 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << node->nb[1] << ", " << std::setw(5) << node->nb[2] << ", " << std::setw(5) << node->nb[3] << "] " + << std::right << std::setw(15) << buf_name << std::right << "\n"; for (int i = 0; i < GGML_MAX_SRC; i++) { if (auto* src = node->src[i]) { + // Get buffer type name for source + const char * src_buf_name = "none"; + ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer; + if (src_buf) { + src_buf_name = ggml_backend_buffer_name(src_buf); + } + file << std::setw(10) << " [ " << std::setw(5) << src->ne[0] << ", " << std::setw(5) << src->ne[1] << ", " @@ -652,6 +646,7 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << src->nb[1] << ", " << std::setw(5) << src->nb[2] << ", " << std::setw(5) << src->nb[3] << "] " + << std::right << std::setw(15) << src_buf_name << std::right << "\n"; } } @@ -661,11 +656,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena for (int i = 0; i < cgraph->n_leafs; i++) { ggml_tensor * node = cgraph->leafs[i]; + // Get buffer type name for leaf + const char * leaf_buf_name = "none"; + ggml_backend_buffer_t leaf_buf = node->view_src ? 
node->view_src->buffer : node->buffer; + if (leaf_buf) { + leaf_buf_name = ggml_backend_buffer_name(leaf_buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << "] " << std::setw(8) << ggml_op_name(node->op) << " " - << std::setw(16) << ggml_get_name(node) << "\n"; + << std::setw(16) << ggml_get_name(node) + << std::setw(20) << leaf_buf_name << "\n"; } // clang-format on file << "========================================\n"; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index edcd0367854..0b302b9320b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -179,12 +179,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor, - std::optional requant_type = std::nullopt); + static std::shared_ptr create_weight_node(ggml_tensor * tensor); - static std::map> create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize = {}); + static std::map> create_weight_nodes(ggml_cgraph * cgraph); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp new file mode 100644 index 00000000000..35d3d93cfd1 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -0,0 +1,365 @@ +#include "ggml-openvino-extra.h" + +#include "ggml-impl.h" +#include "ggml.h" + +#include +#include +#include + +ov::Core & ov_singleton_core() { + static ov::Core core; + return core; +} + +// ===================================================== +// Device Configuration Implementations +// ===================================================== + +void ggml_openvino_device_config::init() { + if (initialized) { + return; + } + device_name = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : "CPU"; + auto available_devices = ov_singleton_core().get_available_devices(); + if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) { + GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str()); + device_name = "CPU"; + } + is_npu = (device_name == "NPU"); + + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + if (device_name == "NPU") { + compile_config = { + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + }; + if (cache_dir) { + compile_config["NPUW_CACHE_DIR"] = cache_dir; + } + } else if (cache_dir) { + ov_singleton_core().set_property(ov::cache_dir(cache_dir)); + } + + // Initialize remote context with queue sharing for GPU + if (device_name == "GPU") { + // Create OpenCL context and queue + cl_int err; + cl_platform_id platform; + err = clGetPlatformIDs(1, &platform, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err); + return; + } + + cl_device_id cl_device; + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err); + return; + } + + cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err); + return; + } + + cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err); + clReleaseContext(cl_ctx); + return; + } + + // Create OpenVINO remote context with queue sharing + remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue); + + // Release the context (queue keeps a reference) + clReleaseContext(cl_ctx); + } else if (device_name == "NPU") { + remote_context = ov_singleton_core().get_default_context(device_name); + } + + initialized = true; +} + +ggml_openvino_device_config::~ggml_openvino_device_config() { + if (cl_queue != nullptr) { + clReleaseCommandQueue(cl_queue); + cl_queue = nullptr; + } +} + +// Get the global device config singleton +ggml_openvino_device_config & ggml_openvino_get_device_config() { + static ggml_openvino_device_config config; + return config; +} + +// Initialize device config (call during backend init) +void ggml_openvino_init_device_config() { + ggml_openvino_get_device_config().init(); +} + +// Get the device name +const std::string & ggml_openvino_get_device_name() { + return ggml_openvino_get_device_config().device_name; +} + +// Check if running on NPU +bool ggml_openvino_is_npu() { + return ggml_openvino_get_device_config().is_npu; +} + +// Get the remote context for the current device (returns empty optional for CPU) +std::optional ggml_openvino_get_remote_context() { + return ggml_openvino_get_device_config().remote_context; +} + +// Get the compile config for the current device +const ov::AnyMap & ggml_openvino_get_compile_config() { + return ggml_openvino_get_device_config().compile_config; +} + +// Get the OpenCL command queue for GPU operations +cl_command_queue ggml_openvino_get_cl_queue() { + return ggml_openvino_get_device_config().cl_queue; +} + +// Get the 
clEnqueueMemFillINTEL function pointer (lazy load) +clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() { + static clEnqueueMemFillINTEL_fn fn = nullptr; + static bool loaded = false; + if (!loaded) { + loaded = true; + cl_platform_id platform; + if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) { + fn = (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL"); + } + } + return fn; +} + +// Get the clEnqueueMemcpyINTEL function pointer (lazy load) +clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { + static clEnqueueMemcpyINTEL_fn fn = nullptr; + static bool loaded = false; + if (!loaded) { + loaded = true; + cl_platform_id platform; + if (clGetPlatformIDs(1, &platform, nullptr) == CL_SUCCESS) { + fn = (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL"); + } + } + return fn; +} + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor) { + if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { + return ExtraQuantType::Q8_0_C; + } + if (strncmp(tensor->name, "output.weight", 13) == 0) { + return ExtraQuantType::Q8_0_C; + } + if (ggml_openvino_is_npu()) { + return ExtraQuantType::Q4_0_128; + } + switch (tensor->type) { + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q5_K: + return ExtraQuantType::Q8_0_C; + default: + return std::nullopt; + } +} + +// ===================================================== +// Extracted Layout Calculation +// ===================================================== + +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { + ggml_openvino_extracted_layout layout = {}; + layout.is_symmetric = false; + + if (!ggml_is_quantized(tensor->type)) { + return layout; + } + + // Only handle 2D weight tensors + if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { + return layout; + } + + int64_t n_elements = ggml_nelements(tensor); + const size_t alignment = 64; // Good for SIMD + + // Check if requantization is needed (NPU-specific) + auto requant_type = ggml_openvino_get_requant_type(tensor); + if (requant_type.has_value()) { + layout.is_requant = true; + layout.requant_type = requant_type; + + // Special case: requant to F16 - just store F16 weights, no scales/biases + if (requant_type.value() == ExtraQuantType::F16) { + layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes + layout.total_size = layout.weights_size; + layout.weights_offset = 0; + // No scales/biases for F16 + return layout; + } + + // Requant to different quantized format (e.g., Q4_0_128) + switch (requant_type.value()) { + case ExtraQuantType::Q4_0_128: + layout.is_u4 = true; + layout.weights_per_block = 128; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q4_0_C: + layout.is_u4 = true; + layout.weights_per_block = tensor->ne[0]; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q8_0_32: + layout.is_u4 = false; + layout.weights_per_block = 32; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q8_0_C: + layout.is_u4 = false; + layout.weights_per_block = tensor->ne[0]; + layout.is_symmetric = true; + break; + case ExtraQuantType::Q8_1_C: + layout.is_u4 = false; + layout.weights_per_block = tensor->ne[0]; + break; + default: + layout.weights_per_block = -1; + GGML_ABORT("Code of re-quantizing to channel-wise is not updated"); + break; + } + + if (layout.is_requant) { + // Calculate sizes for 
requantized format + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); + // For symmetric quantization, we only need one bias value (not one per block) + layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t); + + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = + layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); + return layout; + } + } + + // Normal extraction (no requant) - determine format based on tensor type + switch (tensor->type) { + case GGML_TYPE_Q4_0: + layout.is_u4 = true; + layout.weights_per_block = 32; + layout.is_symmetric = true; + break; + case GGML_TYPE_Q4_1: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q4_K: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + layout.is_u4 = false; + layout.weights_per_block = 32; + layout.is_symmetric = true; + break; + case GGML_TYPE_Q6_K: + layout.is_u4 = false; + layout.weights_per_block = 16; + layout.is_symmetric = true; + break; + case GGML_TYPE_Q5_K: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported quantization type + return layout; + } + + // Calculate sizes + // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + + // Scales and biases: F16 per block + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + // For symmetric quantization, we only need one bias value (not one per block) + layout.biases_size = layout.is_symmetric ? 
sizeof(uint16_t) : n_blocks * sizeof(uint16_t); + + // Layout in buffer: [weights | scales | biases] with alignment + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + + return layout; +} + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote) { + ov::Shape shape; + for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { + shape.push_back(static_cast(tensor->ne[i])); + } + + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + case GGML_TYPE_I32: + element_type = ov::element::i32; + break; + case GGML_TYPE_I64: + element_type = ov::element::i64; + break; + default: + GGML_LOG_ERROR("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type)); + return nullptr; + } + + const auto & device_name = ggml_openvino_get_device_name(); + auto remote_context = ggml_openvino_get_remote_context(); + + std::shared_ptr ov_tensor; + if (is_remote) { + GGML_ASSERT(device_name == "GPU"); + auto gpu_context = remote_context->as(); + auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + ov_tensor = std::make_shared(element_type, shape, tensor->data); + } + + return new ggml_openvino_tensor_extra(ov_tensor); +} diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h new file mode 100644 index 00000000000..e2c5a8ceeae --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -0,0 +1,159 @@ +#pragma once + +#include "ggml.h" +#include "openvino/runtime/core.hpp" + +#define CL_TARGET_OPENCL_VERSION 300 +#include + +#include +#include +#include +#include +#include +#include +#include + +// ExtraQuantType enum - defines requantization target formats +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; + +ov::Core & ov_singleton_core(); + +// Get the remote context for the current device (returns empty optional for CPU) +std::optional ggml_openvino_get_remote_context(); + +// Get the compile config for the current device +const ov::AnyMap & ggml_openvino_get_compile_config(); + +// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU) +cl_command_queue ggml_openvino_get_cl_queue(); + +// Intel USM extension function type +typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue, + void * dst_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue, + cl_bool blocking, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available) +clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL(); + +// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available) +clEnqueueMemcpyINTEL_fn 
ggml_openvino_get_clEnqueueMemcpyINTEL(); + +// ===================================================== +// Global Device Configuration (singleton) +// ===================================================== +// Initialized once during backend init from GGML_OPENVINO_DEVICE env var + +struct ggml_openvino_device_config { + std::string device_name = "CPU"; + bool is_npu = false; + bool initialized = false; + std::optional remote_context; + ov::AnyMap compile_config; + cl_command_queue cl_queue = nullptr; + + void init(); + ~ggml_openvino_device_config(); +}; + +// Get the global device config singleton +ggml_openvino_device_config & ggml_openvino_get_device_config(); + +// Initialize device config (call during backend init) +void ggml_openvino_init_device_config(); + +// Get the device name +const std::string & ggml_openvino_get_device_name(); + +// Check if running on NPU +bool ggml_openvino_is_npu(); + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor); + +// ===================================================== +// OpenVINO Tensor Extra Types +// ===================================================== +// These types are stored in tensor->extra by the OpenVINO backend buffer. +// They allow: +// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction) +// 2. ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request) + +// Base class for OpenVINO tensor extra data +struct ggml_openvino_extra_base { + enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR }; + Type type; + virtual ~ggml_openvino_extra_base() = default; +protected: + explicit ggml_openvino_extra_base(Type t) : type(t) {} +}; + +// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node +struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { + std::shared_ptr constant; // Pre-built OpenVINO Constant node + + explicit ggml_openvino_weight_extra(std::shared_ptr c) + : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {} +}; + +// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant +struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { + ov::Tensor weights; // U4 or U8 extracted weights + ov::Tensor scales; // F16 scales + ov::Tensor biases; // F16 biases (zero points) + std::shared_ptr constant; // Pre-built OpenVINO weight subgraph + + ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr c) + : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), + weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {} +}; + +// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request +struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { + std::shared_ptr tensor; // For direct use with infer_request + + explicit ggml_openvino_tensor_extra(std::shared_ptr t) + : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {} +}; + +// ===================================================== +// Extracted Size Calculation for Quantized Tensors +// ===================================================== +// For quantized tensors, we need extra space to store extracted weights, scales, and biases. +// Returns the total size needed in the buffer for extracted data. 
+ +struct ggml_openvino_extracted_layout { + size_t total_size; // Total bytes needed + size_t weights_offset; // Offset to weights in buffer + size_t weights_size; // Size of weights in bytes + size_t scales_offset; // Offset to scales in buffer + size_t scales_size; // Size of scales in bytes + size_t biases_offset; // Offset to biases in buffer + size_t biases_size; // Size of biases in bytes + bool is_u4; // true for U4 weights, false for U8 + int64_t weights_per_block;// weights per scale/bias block + bool is_symmetric; // true for symmetric quantization + + // Requantization info + bool is_requant; // true if this tensor needs requantization + std::optional requant_type; // target requant type if is_requant +}; + +// Calculate the buffer layout for extracted quantized data +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f70..a1b5b5dd321 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -3,17 +3,593 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/utils.h" +#include "ggml-quants.hpp" #include "ggml.h" #include +#include +#include #include +#include #include +#include +#include +#include +#include #include #include #include -#define GGML_OPENVINO_MAX_STREAMS 8 +// ===================================================== +// OpenVINO Buffer Implementation using ov::Tensor +// ===================================================== +// +// Design: This implementation uses a hybrid approach: +// 1. For weight tensors: Store a pre-built ov::op::v0::Constant in tensor->extra +// - This avoids the memcpy during graph construction +// - For quantized weights, the constant is already converted to OpenVINO format +// 2. 
For KV cache / compute tensors: Store an ov::Tensor in tensor->extra +// - This can be directly passed to infer_request +// - Future: can be changed to ov::RemoteTensor for GPU/NPU +// +// This design is similar to: +// - CUDA split buffer: tensor->extra stores device pointers +// - CPU repack buffer: tensor->extra stores tensor_traits with repacked data +// ===================================================== + +// Buffer context that manages per-tensor allocations (no contiguous buffer for weights) +struct ggml_backend_openvino_buffer_context { + int device; + std::string name; + + // For non-weight buffers (KV cache, compute), we still use contiguous allocation + void * data; + size_t size; + bool is_remote; + + // Wrapping of the buffer + std::shared_ptr ov_buffer; + + // Track all extras for cleanup + std::map tensor_extras; + + // Used for re-allocation on device for kvcache + void * data_prev; + + ggml_backend_openvino_buffer_context(int device, size_t size, bool is_remote = false) : + device(device), + name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), + data(nullptr), + size(size), + is_remote(is_remote) { + if (size == 0) { + return; + } + + const auto & device_name = ggml_openvino_get_device_name(); + + if (is_remote) { + GGML_ASSERT(device_name == "GPU"); + auto remote_context = ggml_openvino_get_remote_context(); + auto gpu_context = remote_context->as(); + ov::intel_gpu::ocl::USMTensor usm_tensor = + gpu_context.create_usm_device_tensor(ov::element::u8, ov::Shape{size}); + data = usm_tensor.get(); + ov_buffer = std::make_shared(std::move(usm_tensor)); + } else { + data = ggml_aligned_malloc(size); + ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); + } + + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); + return; + } + + if (reinterpret_cast(data) % TENSOR_ALIGNMENT != 0) { + GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(), + TENSOR_ALIGNMENT); + GGML_ABORT("fatal error"); + } + } + + ~ggml_backend_openvino_buffer_context() { + // Clean up all tensor extras + for (auto & pair : tensor_extras) { + delete pair.second; + } + tensor_extras.clear(); + if (!is_remote && data != nullptr) { + ggml_aligned_free(data, size); + } + } +}; + +// Buffer type context (per-device) +struct ggml_backend_openvino_buffer_type_context { + int device; + std::string name; +}; + +// Buffer interface functions +static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + return ctx->data; +} + +static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && + ggml_openvino_get_device_name() == "GPU") { + GGML_ASSERT(ctx->tensor_extras.empty()); + auto device = ctx->device; + auto size = 
ctx->size; + auto * data_prev = ctx->data; + delete ctx; + ctx = new ggml_backend_openvino_buffer_context(device, size, true); + buffer->context = ctx; + tensor->data = (char *) ctx->data + ((char *) tensor->data - (char *) data_prev); + } + + // Views share the extra from view_src + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + if (tensor->view_src->extra != nullptr) { + tensor->extra = tensor->view_src->extra; + } + return GGML_STATUS_SUCCESS; + } + + ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (tensor->data != nullptr) { + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); + if (extra != nullptr) { + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + } + } + + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { + GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memfill + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); + if (queue != nullptr && mem_fill_fn != nullptr) { + uint8_t pattern = value; + cl_int err = mem_fill_fn(queue, (char *) tensor->data + offset, &pattern, sizeof(pattern), size, 0, nullptr, + nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err); + } + clFinish(queue); + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer\n", __func__); + } + } else { + memset((char *) tensor->data + offset, value, size); + } +} + +static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + // Check if this is a weight buffer (usage is set BEFORE set_tensor is called) + bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + // Full tensor set: offset=0, full size, not a view + bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr); + // 2D tensor (typical weight shape) + bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1); + + // Check if this is a quantized weight tensor that needs extraction/requantization + ggml_openvino_extracted_layout layout = {}; + if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) { + layout = ggml_openvino_get_extracted_layout(tensor); + } + + if (layout.total_size > 0) { + // Quantized weight tensor with extraction/requantization + uint8_t * buf_base = (uint8_t *) tensor->data; + + try { + std::shared_ptr constant = process_weight_tensor(tensor, data, buf_base); + constant->set_friendly_name(tensor->name); + + // Store in tensor->extra + 
if (layout.is_requant && layout.requant_type.has_value() && + layout.requant_type.value() == ExtraQuantType::F16) { + // F16 requant case - use weight_extra + auto * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); + } else { + // Quantized case - use quantized_weight_extra + // Create tensors with external memory (already filled by process_weight_tensor) + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + ov::Shape scale_shape = {static_cast(tensor->ne[1]), + static_cast(tensor->ne[0] / layout.weights_per_block)}; + + ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); + ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + + auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), + std::move(biases), constant); + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + + if (layout.is_requant) { + GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", + layout.is_u4 ? 4 : 8, layout.weights_per_block); + } else { + int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, + tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); + } + } + + } catch (const std::exception & e) { + GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what()); + // Fall back to storing raw data + memcpy((char *) tensor->data + offset, data, size); + } + } else if (is_weight_buffer && is_full_tensor_set && is_2d && + (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { + // F16/F32/BF16 weight tensor + try { + std::shared_ptr constant = process_weight_tensor(tensor, data, tensor->data); + constant->set_friendly_name(tensor->name); + + // Store in tensor->extra + ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); + + } catch (const std::exception & e) { + GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name, + e.what()); + } + } else { + // Non-weight tensor (KV cache, activations, etc.) 
- copy data + if (ctx->is_remote) { + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue != nullptr && mem_cpy_fn != nullptr) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, (char *) tensor->data + offset, data, size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err); + } + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + } + } else { + memcpy((char *) tensor->data + offset, data, size); + } + + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor, ctx->is_remote); + if (extra == nullptr) { + GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); + return; + } + + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + } +} + +static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + // GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name); + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memcpy (device-to-host) + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue != nullptr && mem_cpy_fn != nullptr) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, data, (const char *) tensor->data + offset, size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL failed with error %d\n", __func__, err); + } + } else { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + } + } else { + memcpy(data, (const char *) tensor->data + offset, size); + } +} + +static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * src, + ggml_tensor * dst) { + // GGML_LOG_DEBUG("%s: src tensor name=%s, dst tensor name=%s\n", __func__, src->name, dst->name); + GGML_ASSERT(src != nullptr && dst != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (ctx->is_remote) { + // For remote (device) buffers, use OpenCL USM memcpy + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_cpy_fn = ggml_openvino_get_clEnqueueMemcpyINTEL(); + if (queue == nullptr || mem_cpy_fn == nullptr) { + GGML_LOG_ERROR("%s: no OpenCL queue or clEnqueueMemcpyINTEL not available for GPU buffer\n", __func__); + return false; + } + // Can copy from host to device + if (ggml_backend_buffer_is_host(src->buffer)) { + cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (host-to-device) failed with error %d\n", __func__, err); + return false; + } + return true; + } + // Can also copy from device to device if both are OpenVINO remote buffers + if (ggml_backend_buffer_is_openvino(src->buffer)) { + ggml_backend_openvino_buffer_context * src_ctx = + (ggml_backend_openvino_buffer_context *) src->buffer->context; + if 
(src_ctx->is_remote) { + cl_int err = + mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, + err); + return false; + } + return true; + } + } + return false; + } + + // Host buffer - can copy from any host buffer + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; +} + +static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + GGML_ASSERT(ctx->data != nullptr); + if (ctx->is_remote) { + cl_command_queue queue = ggml_openvino_get_cl_queue(); + auto mem_fill_fn = ggml_openvino_get_clEnqueueMemFillINTEL(); + if (queue != nullptr && mem_fill_fn != nullptr) { + uint8_t pattern = value; + cl_int err = mem_fill_fn(queue, ctx->data, &pattern, sizeof(pattern), ctx->size, 0, nullptr, nullptr); + if (err != CL_SUCCESS) { + GGML_LOG_WARN("%s: clEnqueueMemFillINTEL failed with error %d\n", __func__, err); + } + clFinish(queue); + } else { + GGML_LOG_WARN("%s: no OpenCL queue or clEnqueueMemFillINTEL not available for GPU buffer clear\n", + __func__); + } + } else { + memset(ctx->data, value, ctx->size); + } +} + +static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = { + /* .free_buffer = */ ggml_backend_openvino_buffer_free_buffer, + /* .get_base = */ ggml_backend_openvino_buffer_get_base, + /* .init_tensor = */ ggml_backend_openvino_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor, + /* .clear = */ ggml_backend_openvino_buffer_clear, + /* .reset = */ NULL, +}; + +// Buffer type interface functions +static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_openvino_buffer_type_context * buft_ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + + // Create buffer context with contiguous memory allocation + ggml_backend_openvino_buffer_context * ctx = new ggml_backend_openvino_buffer_context(buft_ctx->device, size); + + if (ctx->data == nullptr && size > 0) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + delete ctx; + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_openvino_buffer_interface, ctx, size); +} + +static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return TENSOR_ALIGNMENT; +} + +static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return SIZE_MAX; +} + +static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const ggml_tensor * tensor) { + GGML_UNUSED(buft); + + // For quantized 2D tensors (weights), we need extra space for extracted data + if (ggml_is_quantized(tensor->type) && tensor->ne[2] 
+        ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
+        if (layout.total_size > 0) {
+            GGML_LOG_DEBUG(
+                "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
+                __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
+                layout.biases_size);
+            return layout.total_size;
+        }
+    }
+
+    return ggml_nbytes(tensor);
+}
+
+static bool ggml_backend_openvino_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    // Currently using host memory via ov::Tensor
+    // This will be false when using GPU/NPU remote tensors
+    return true;
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_openvino_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_openvino_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_openvino_buffer_type_get_alignment,
+    /* .get_max_size   = */ ggml_backend_openvino_buffer_type_get_max_size,
+    /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_openvino_buffer_type_is_host,
+};
+
+// Get buffer type for a specific device
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
+    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::vector<ggml_backend_buffer_type> buffer_types;
+    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;
+
+    if (buffer_types.empty()) {
+        int device_count = ggml_backend_openvino_get_device_count();
+        buffer_types.resize(device_count);
+        buffer_type_contexts.resize(device_count);
+
+        for (int i = 0; i < device_count; i++) {
+            buffer_type_contexts[i].device = i;
+            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);
+
+            buffer_types[i] = ggml_backend_buffer_type{
+                /* .iface   = */ ggml_backend_openvino_buffer_type_interface,
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
+                /* .context = */ &buffer_type_contexts[i],
+            };
+        }
+    }
+
+    return &buffer_types[device];
+}
+
+// =====================================================
+// OpenVINO Host Buffer Implementation
+// =====================================================
+
+static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
+    static std::string name;
+    name = ctx->name + "_HOST";
+    return name.c_str();
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_openvino_host_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_openvino_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_openvino_buffer_type_get_alignment,
+    /* .get_max_size   = */ ggml_backend_openvino_buffer_type_get_max_size,
+    /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_openvino_buffer_type_is_host,
+};
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) {
+    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::vector<ggml_backend_buffer_type> buffer_types;
+    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;
+
+    if (buffer_types.empty()) {
+        int device_count = ggml_backend_openvino_get_device_count();
+        buffer_types.resize(device_count);
+        buffer_type_contexts.resize(device_count);
+
+        for (int i = 0; i < device_count; i++) {
+            buffer_type_contexts[i].device = i;
+            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);
+
+            buffer_types[i] = ggml_backend_buffer_type{
+                /* .iface   = */ ggml_backend_openvino_host_buffer_type_interface,
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
+                /* .context = */ &buffer_type_contexts[i],
+            };
+        }
+    }
+
+    return &buffer_types[device];
+}
+
+bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
+    return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
+}
+
+bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
+}
+
+bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name;
+}
+
+// =====================================================
+// OpenVINO Backend Context and Interface
+// =====================================================
 struct ggml_backend_openvino_context {
     int device;  // the device ID currently in use
@@ -111,13 +687,6 @@ GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid());
 }
 
-// device buffer
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
-    GGML_ASSERT(device >= 0);
-    return ggml_backend_cpu_buffer_type();
-    GGML_UNUSED(device);
-}
-
 struct ggml_backend_openvino_device_context {
     int device;
     std::string name;
@@ -172,6 +741,11 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(g
     return ggml_backend_openvino_buffer_type(ctx->device);
 }
 
+static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
+    return ggml_backend_openvino_host_buffer_type(ctx->device);
+}
+
 static bool is_op_unsupported_case(const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_SOFT_MAX: {
@@ -350,7 +924,9 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
 }
 
 static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
+    // Support our own buffer type and any host buffer (for mmap'd files, etc.)
+    return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft);
+    // return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft);
 
     GGML_UNUSED(dev);
 }
 
@@ -362,7 +938,8 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface
     /* .get_props            = */ ggml_backend_openvino_device_get_props,
     /* .init_backend         = */ ggml_backend_openvino_device_init,
     /* .get_buffer_type      = */ ggml_backend_openvino_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
+    // /* .get_host_buffer_type = */ NULL,
+    /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type,
     /* .buffer_from_host_ptr = */ NULL,
     /* .supports_op          = */ ggml_backend_openvino_device_supports_op,
     /* .supports_buft        = */ ggml_backend_openvino_device_supports_buft,
@@ -410,6 +987,10 @@ static int get_openvino_device_count() {
 }
 
 static ggml_openvino_device_info ggml_openvino_init() {
+    // Initialize device config singleton from env var
+    ggml_openvino_init_device_config();
+    GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str());
+
     ggml_openvino_device_info info = {};
     info.device_count = get_openvino_device_count();
     return info;
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 2076c3c75d3..8946b73a561 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -55,9 +55,18 @@ void extract_q4_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1);  // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-        biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        }
         unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
     });
 }
@@ -95,10 +104,19 @@ void extract_q8_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1);  // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
         scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-        biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        }
         for (size_t j = 0; j < weights_per_block; ++j) {
             uint8_t x = block_data[j + 2];  // j+2 to skip the scale bytes.
             // Original data is in int8_t, so we add a bias of -128 and invert the first bit.
@@ -190,6 +208,8 @@ void extract_q6_k_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1);  // Symmetric quantization
+
     ov::parallel_for(n_super_block, [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
 
@@ -199,7 +219,14 @@ void extract_q6_k_data(const ggml_tensor * tensor,
         for (size_t j = 0; j < 16; j++) {
             scales[j + i * 16] =
                 ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-            biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+            if (is_scalar_bias) {
+                if (i == 0 && j == 0) {
+                    biases[0] = ov::float16(-32.f * static_cast<float>(scales[0]));
+                }
+            } else {
+                biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            }
         }
 
         uint8_t * ql = block_data;
@@ -302,15 +329,22 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
     // Expand dimensions for scales and biases
     auto scale_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty();  // Symmetric quantization
 
     ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
 
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
         packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_shape.push_back(1);
         scales.set_shape(scale_shape);
-        biases.set_shape(scale_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     // Create graph nodes
@@ -318,15 +352,23 @@
                                                           static_cast(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-    ov::Tensor biases_u8(ov::element::u8, scale_shape);
+    ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? ov::Shape{} : scale_shape);
 
     // Calculate zero point
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
     uint8_t * bias_u8_data = biases_u8.data();
-    for (size_t i = 0; i < biases_u8.get_size(); ++i) {
-        bias_u8_data[i] =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q8_0, bias = -128 * scale, so zero_point = 128
+        bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < biases_u8.get_size(); ++i) {
+            bias_u8_data[i] =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+        }
     }
 
     auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
@@ -361,17 +403,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty();  // Symmetric quantization
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
 
-    // Requantized channel-wise case
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
        packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_bias_shape.push_back(1);
         scales.set_shape(scale_bias_shape);
-        biases.set_shape(scale_bias_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_bias_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
@@ -382,14 +430,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     // Pack zero points: two subsequent values into one
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
+    ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape);
     uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
-    for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
-        uint8_t bias1 =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
-        uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
-                                             static_cast<float>(scale_data[i * 2 + 1]));
-        zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q4_0, bias = -8 * scale, so zero_point = 8
+        uint8_t zp = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+        zero_point_data[0] = (zp << 4) | (zp & 0x0F);
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
+            uint8_t bias1 =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
+            uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
+                                                 static_cast<float>(scale_data[i * 2 + 1]));
+            zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+        }
     }
 
     auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
@@ -418,56 +475,231 @@
     return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
 }
 
-std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
-    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
-    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
+// Extract quantized weights from tensor and create weight subgraph
+std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
+                                                    const void * data,
+                                                    ov::Tensor & weights,
+                                                    ov::Tensor & scales,
+                                                    ov::Tensor & biases) {
+    // Create a temporary tensor for extraction functions that read from tensor->data
+    ggml_tensor temp_tensor = *tensor;
+    temp_tensor.data = const_cast<void *>(data);
+
+    // Determine block size based on tensor type
+    int64_t weights_per_block;
+    bool is_u4;
+    switch (tensor->type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_K:
+            is_u4 = true;
+            weights_per_block = 32;
+            break;
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q5_K:
+            is_u4 = false;
+            weights_per_block = 32;
+            break;
+        case GGML_TYPE_Q6_K:
+            is_u4 = false;
+            weights_per_block = 16;
+            break;
+        default:
+            throw std::runtime_error("Unsupported quantized type for extraction: " +
+                                     std::string(ggml_type_name(tensor->type)));
+    }
 
-    std::shared_ptr<ov::Node> weight_node;
-    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
+    // Extract quantized data
+    switch (tensor->type) {
+        case GGML_TYPE_Q4_0:
+            extract_q4_0_data(&temp_tensor, weights, scales, biases);
+            break;
+        case GGML_TYPE_Q4_1:
+            extract_q4_1_data(&temp_tensor, weights, scales, biases);
+            break;
+        case GGML_TYPE_Q4_K:
+            extract_q4_k_data(&temp_tensor, weights, scales, biases);
+            break;
+        case GGML_TYPE_Q8_0:
+            extract_q8_0_data(&temp_tensor, weights, scales, biases);
+            break;
+        case GGML_TYPE_Q6_K:
+            extract_q6_k_data(&temp_tensor, weights, scales, biases);
+            break;
+        case GGML_TYPE_Q5_K:
+            extract_q5_k_data(&temp_tensor, weights, scales, biases);
+            break;
+        default:
+            throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
+    }
 
-    // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k)
-    // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0)
-    std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") {
-        requant_type = ExtraQuantType::F16;
+    // Create the OpenVINO weight subgraph
+    ov::Output<ov::Node> weight_node;
+    if (is_u4) {
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
+    } else {
+        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     }
 
+    auto result = weight_node.get_node_shared_ptr();
+    result->set_friendly_name(tensor->name);
+    return result;
+}
+
+// Requantize weights to target format, writing to provided buffers
+std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
+                                                const void * data,
+                                                ExtraQuantType requant_type,
+                                                int64_t block_size,
+                                                ov::Tensor & weights,
+                                                ov::Tensor & scales,
+                                                ov::Tensor & biases) {
+    int64_t n_elements = ggml_nelements(tensor);
+
+    // First dequantize to F32
+    std::vector<float> weights_f32(n_elements);
+    ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements);
+
+    // Handle F16 case - just convert and create constant
     if (requant_type == ExtraQuantType::F16) {
-        ov::Tensor weights(ov::element::f16, node_shape);
-        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
-        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        weight_node->set_friendly_name(tensor->name);
-        return weight_node;
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements);
+        auto result = std::make_shared<ov::op::v0::Constant>(weights);
+        result->set_friendly_name(tensor->name);
+        return result;
+    }
+
+    // Requantize to target quantized format
+    bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
+
+    if (is_u4) {
+        quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
+    } else if (requant_type == ExtraQuantType::Q8_1_C) {
+        quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size);
+    } else {
+        quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
     }
 
-    int64_t block_size = node_shape[1];
-    if (requant_type == ExtraQuantType::Q4_0_128) {
-        block_size = 128;
-    } else if (requant_type == ExtraQuantType::Q8_0_32) {
-        block_size = 32;
+    // Create the OpenVINO weight subgraph
+    ov::Output<ov::Node> weight_node;
+    if (is_u4) {
+        weight_node = make_int4_weights(weights, scales, biases, block_size);
+    } else {
+        weight_node = make_int8_weights(weights, scales, biases, block_size);
     }
 
-    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
-    ov::Tensor weights;
-    ov::Tensor scales(ov::element::f16, scales_shape);
-    ov::Tensor bias(ov::element::f16, scales_shape);
+    auto result = weight_node.get_node_shared_ptr();
+    result->set_friendly_name(tensor->name);
+    return result;
+}
 
-    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q8_1_C) {
-        weights = ov::Tensor(ov::element::u8, node_shape);
-        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
-        weights = ov::Tensor(ov::element::u8, node_shape);
-        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+    GGML_ASSERT(tensor != nullptr);
+    GGML_ASSERT(data != nullptr);
+
+    // Get 2D shape for weights [rows, cols]
+    ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
+
+    // Handle F16/F32/BF16 weights
+    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
+        ov::element::Type element_type;
+        switch (tensor->type) {
+            case GGML_TYPE_F32:
+                element_type = ov::element::f32;
+                break;
+            case GGML_TYPE_F16:
+                element_type = ov::element::f16;
+                break;
+            case GGML_TYPE_BF16:
+                element_type = ov::element::bf16;
+                break;
+            default:
+                OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
+        }
+
+        if (output_base_ptr) {
+            // Using external buffer - copy data and create shared-memory constant
+            size_t tensor_bytes = ggml_nbytes(tensor);
+            memcpy(output_base_ptr, data, tensor_bytes);
+            ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
+            return std::make_shared<ov::op::v0::Constant>(ov_tensor);
+        } else {
+            // Allocate internal buffer
+            ov::Tensor weights(element_type, node_shape);
+            memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
+            return std::make_shared<ov::op::v0::Constant>(weights);
+        }
+    }
+
+    // Handle quantized weights
+    if (!ggml_is_quantized(tensor->type)) {
+        OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
+    }
+
+    auto layout = ggml_openvino_get_extracted_layout(tensor);
+    if (layout.total_size == 0) {
+        OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
+    }
+
+    std::shared_ptr<ov::Node> result;
+
+    if (layout.is_requant && layout.requant_type.has_value()) {
+        // Requantization path
+        if (layout.requant_type.value() == ExtraQuantType::F16) {
+            // Requant to F16
+            ov::Tensor weights;
+            if (output_base_ptr) {
+                weights = ov::Tensor(ov::element::f16, node_shape,
+                                     static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
+            } else {
+                weights = ov::Tensor(ov::element::f16, node_shape);
+            }
+            ov::Tensor dummy_scales, dummy_biases;  // Not used for F16
+            result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases);
+        } else {
+            // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
+            ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+            ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+            // For symmetric quantization, biases are a single value instead of per-block
+            ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+
+            ov::Tensor weights, scales, biases;
+            if (output_base_ptr) {
+                uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
+                weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
+                scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+                biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
+            } else {
+                weights = ov::Tensor(weight_type, node_shape);
+                scales = ov::Tensor(ov::element::f16, scale_shape);
+                biases = ov::Tensor(ov::element::f16, bias_shape);
+            }
+
+            result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
+                                           scales, biases);
+        }
+    } else {
+        // Normal extraction path (no requant)
+        ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+        ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+        // For symmetric quantization, biases are a single value instead of per-block
+        ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+
+        ov::Tensor weights, scales, biases;
+        if (output_base_ptr) {
+            uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
+            weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
+            scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+            biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
+        } else {
+            weights = ov::Tensor(weight_type, node_shape);
+            scales = ov::Tensor(ov::element::f16, scale_shape);
+            biases = ov::Tensor(ov::element::f16, bias_shape);
+        }
+
+        result = extract_quantized_weights(tensor, data, weights, scales, biases);
     }
-    weight_node->set_friendly_name(tensor->name);
-    return weight_node;
+    return result;
 }
 
 void quantize_q4_0(const float * x,
@@ -482,6 +714,8 @@
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1);  // Symmetric quantization
+
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f;  // absolute max
         float max = 0.0f;
@@ -498,7 +732,13 @@
 
         if (d == 0) {
             scales[i] = ov::float16(1.0f);
-            biases[i] = ov::float16(-8.0f);
+            if (is_scalar_bias) {
+                if (i == 0) {
+                    biases[0] = ov::float16(-8.0f);
+                }
+            } else {
+                biases[i] = ov::float16(-8.0f);
+            }
             uint8_t zp = 8;
             memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
             continue;
@@ -506,7 +746,14 @@
 
         const float id = 1.0f / d;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-8.f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * d);
+        }
 
         for (int j = 0; j < qk / 2; ++j) {
             const float x0 = x[i * qk + 2 * j] * id;
@@ -530,6 +777,8 @@ void quantize_q8_0(const float * x,
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1);  // Symmetric quantization
+
 
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f;  // absolute max
@@ -543,7 +792,14 @@
 
         const float d = amax / 127.0f;
        const float id = d ? 1.0f / d : 0.0f;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-128.0f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.0f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-128.0f * d);
+        }
 
         for (int j = 0; j < qk; ++j) {
             const float x0 = x[i * qk + j] * id;
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 71ae317a39e..a1334e2408d 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -1,10 +1,11 @@
 #pragma once
 
+#include "ggml-openvino-extra.h"  // For ExtraQuantType
+#include "ggml.h"
+
 #include
 #include
 #include
 
-#include "ggml.h"
-
 void unpack_32_4(const uint8_t* data, uint8_t* dst);
 
 void extract_q4_0_data(const ggml_tensor* tensor,
@@ -51,9 +52,37 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 
-enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
-
-std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
+// Extract quantized weights from tensor and create weight subgraph
+// If weights/scales/biases are provided (non-empty), uses them as output buffers
+// Otherwise allocates new ov::Tensors internally
+// Returns the weight node (make_int4_weights or make_int8_weights result)
+std::shared_ptr<ov::Node> extract_quantized_weights(
+    const ggml_tensor * tensor,
+    const void * data,  // Source data pointer (may differ from tensor->data)
+    ov::Tensor & weights,
+    ov::Tensor & scales,
+    ov::Tensor & biases);
+
+// Requantize weights from tensor to target format, writing to provided buffers
+// For F16 target, only weights buffer is used (scales/biases ignored)
+// Returns the weight node
+std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
+                                                const void * data,  // Source data pointer
+                                                ExtraQuantType requant_type,
+                                                int64_t block_size,
+                                                ov::Tensor & weights,
+                                                ov::Tensor & scales,
+                                                ov::Tensor & biases);
+
+// Process weight tensor and create an OpenVINO constant node
+// Handles F16/F32/BF16 and quantized weights, with optional requantization
+// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
+// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
+// Returns the weight constant node
+std::shared_ptr<ov::Node> process_weight_tensor(
+    const ggml_tensor * tensor,
+    const void * data,               // Source data pointer (may differ from tensor->data)
+    void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
 
 void quantize_q4_0(const float* x,
                    ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk);
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 836e366fd7f..89cf51f8801 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -1,6 +1,7 @@
 #include "utils.h"
 
 #include "ggml-impl.h"
+#include "ggml-openvino-extra.h"
 #include "ggml-openvino/ggml-decoder.h"
 #include "ggml.h"
 #include "openvino/frontend.hpp"
@@ -36,32 +37,22 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
-static ov::Core core;
-
 enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
-    auto get_device = [&] {
-        std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
getenv("GGML_OPENVINO_DEVICE") : "CPU"; - auto available_devices = core.get_available_devices(); - if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) { - GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str()); - device = "CPU"; - } - return device; - }; - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } - static const auto device = get_device(); - static const auto is_static = device == "NPU" ? true : false; + // Use device from singleton (initialized during backend init) + const auto & device = ggml_openvino_get_device_name(); + const auto is_static = ggml_openvino_is_npu(); return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); } enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { + auto & core = ov_singleton_core(); + const auto & config = ggml_openvino_get_compile_config(); static auto is_static = false; - static auto config = get_ov_compile_config(device); // if (is_naive(cgraph)) { // return naive_compute(cgraph, core, device, config); @@ -115,7 +106,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin infer_request_cache.erase(key); std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); decoder_end_time = ggml_time_us(); @@ -132,7 +123,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ov::serialize(model, timestamped_filename); } - auto compiled_model = core.compile_model(model, device, config); + ov::CompiledModel compiled_model; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + compiled_model = core.compile_model(model, remote_context.value(), config); + } else { + compiled_model = core.compile_model(model, device, config); + } compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); infer_request_cache[key] = infer_request; @@ -181,18 +178,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); } - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; } enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { + auto & core = ov_singleton_core(); + auto get_prefill_chunk_size = [] { const char * 
         if (chunk_size_str && atoi(chunk_size_str) > 0) {
@@ -204,7 +203,7 @@
     static std::string device = "NPU";
     static auto is_static = true;
     static auto prefill_chunk_size = get_prefill_chunk_size();
-    static auto config = get_ov_compile_config(device);
+    const auto & config = ggml_openvino_get_compile_config();
 
     if (is_naive(cgraph)) {
         return naive_compute(cgraph, core, device, config);
@@ -263,7 +262,7 @@
         infer_request_cache_prefill.erase(key);
 
         std::shared_ptr<ov::Model> model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
         auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
                                                                     true, prefill_chunk_size);
@@ -289,8 +288,16 @@
             ov::serialize(model_decode, timestamped_filename);
         }
 
-        auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device));
-        auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device));
+        ov::CompiledModel compiled_model_prefill;
+        ov::CompiledModel compiled_model_decode;
+        auto remote_context = ggml_openvino_get_remote_context();
+        if (remote_context.has_value()) {
+            compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config);
+            compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config);
+        } else {
+            compiled_model_prefill = core.compile_model(model_prefill, device, config);
+            compiled_model_decode = core.compile_model(model_decode, device, config);
+        }
 
         infer_request_cache_prefill[key] =
             std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
@@ -377,54 +384,17 @@
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
-        GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
+        GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         if (!cache_hit) {
-            GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
-            GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         }
-        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
     }
 
     return GGML_STATUS_SUCCESS;
 }
 
-ov::AnyMap get_ov_compile_config(const std::string & device) {
-    ov::AnyMap config;
-    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
-    if (device == "NPU") {
-        config = {
-            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"   },
-            {"NPU_USE_NPUW",                      "YES"   },
-            {"NPUW_DEVICES",                      "NPU"   },
-            {"NPUW_FOLD",                         "YES"   },
-            {"NPUW_WEIGHTS_BANK",                 "shared"},
-            {"NPUW_FUNCALL_FOR_ALL",              "YES"   },
-            {"NPUW_FUNCALL_ASYNC",                "YES"   },
-            {"NPUW_DQ",                           "YES"   },
-            {"NPUW_DQ_FULL",                      "NO"    },
-        };
-        if (cache_dir) {
-            config["NPUW_CACHE_DIR"] = cache_dir;
-        }
-    } else if (cache_dir) {
-        core.set_property(ov::cache_dir(cache_dir));
-    }
-    return config;
-}
-
-std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
-    if (device == "NPU") {
-        return {
-            {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
-            {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
-            {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
-            {GGML_TYPE_Q6_K, ExtraQuantType::F16     },
-            {GGML_TYPE_Q5_K, ExtraQuantType::F16     },
-        };
-    }
-    return {};
-}
-
 bool is_naive(ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     return cgraph->n_nodes < naive_graph_size_threshold;
@@ -449,7 +419,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
     if (getenv("GGML_OPENVINO_DUMP_IR")) {
         ov::serialize(model, "IR_naive.xml");
     }
-    auto infer_request = core.compile_model(model, device, config).create_infer_request();
+
+    ov::InferRequest infer_request;
+    auto remote_context = ggml_openvino_get_remote_context();
+    if (remote_context.has_value()) {
+        infer_request = core.compile_model(model, remote_context.value(), config).create_infer_request();
+    } else {
+        infer_request = core.compile_model(model, device, config).create_infer_request();
+    }
 
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
@@ -472,6 +449,18 @@
 namespace {
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
+
+    if (ggml_tensor->extra != nullptr) {
+        // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
+        auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
+        if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
+            throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name);
+        }
+        auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
+        return *tensor_extra->tensor;
+    }
+
+    // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
     if (ggml_tensor->op == GGML_OP_VIEW) {
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 85bb3a2f882..44ca2db00fa 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -71,10 +71,6 @@ bool get_is_prefill(const ggml_tensor * inp_pos);
 
 graph_key compute_graph_key(struct ggml_cgraph * cgraph);
 
-ov::AnyMap get_ov_compile_config(const std::string & device);
-
-std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
-
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
 
 ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);