79 changes: 34 additions & 45 deletions docs/build.md
@@ -610,10 +610,23 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp with the OpenVINO backend
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```
- OpenCL development packages
```bash
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
```

- **Windows:**
- Download the Visual Studio 2022 Build Tools ([vs_BuildTools.exe](https://aka.ms/vs/17/release/vs_BuildTools.exe)) and select "Desktop development with C++" under Workloads
- Install git
- Install OpenCL with vcpkg
```powershell
cd C:\
git clone https://github.com/microsoft/vcpkg
cd vcpkg
.\bootstrap-vcpkg.bat
.\vcpkg install opencl
```
- Use "x64 Native Tools Command Prompt" for Build

### 1. Install OpenVINO Runtime
@@ -625,19 +638,19 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp with the OpenVINO backend
<details>
<summary>📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu</summary>
<br>

```bash
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
chmod +x install-openvino-from-archive.sh
./install-openvino-from-archive.sh
```

Verify OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
</details>

### 2. Build llama.cpp with OpenVINO Backend

@@ -651,60 +664,41 @@ git switch dev_backend_openvino

- **Linux:**
```bash
# Build with OpenVINO support
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
cmake --build build/ReleaseOV --parallel
```

- **Windows:**
```bash
# Build with OpenVINO support
"C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat"
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --parallel
```

### 3. Download Sample Model

Download a model for testing:

```bash
# Create models directory
mkdir -p ~/models/

# Download model file: Llama-3.2-1B-Instruct-Q4_0.gguf
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
  -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### 4. Run Inference with OpenVINO Backend

When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# If GGML_OPENVINO_DEVICE is unset or the requested device is unavailable, the backend defaults to CPU.
export GGML_OPENVINO_DEVICE=GPU

./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

To run in chat mode:
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### Configuration Options
@@ -716,16 +710,11 @@ Control OpenVINO behavior using these environment variables:
- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.

### Example with Profiling

```bash
GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

### Docker build of llama.cpp with OpenVINO Backend
@@ -741,7 +730,7 @@ docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile
# Build a minimal CLI-only image containing just the llama-cli executable.
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .

# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .

# If you are behind a proxy:
@@ -764,17 +753,17 @@ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
```

Run llama.cpp Server with OpenVINO Backend
```bash
# Run the server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

# In a NEW terminal, test the server with curl

# If you are behind a proxy, set NO_PROXY so requests to localhost bypass the proxy
export NO_PROXY=localhost,127.0.0.1

# Test health endpoint
curl -f http://localhost:8080/health
14 changes: 12 additions & 2 deletions ggml/include/ggml-openvino.h
@@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);

GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);

GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);

GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
@@ -43,8 +51,10 @@ struct ggml_openvino_device_info {
std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
};

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
const ggml_openvino_device_info & ggml_openvino_info();
#endif
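
For orientation, here is a minimal smoke-test sketch (not part of this PR) showing how the entry points declared in this header could be exercised. The `main` program, the printed messages, and the assumption that ggml is built with `GGML_OPENVINO=ON` and linked in are all illustrative; only the `ggml_backend_openvino_*`, `ggml_backend_buft_*`, and `ggml_openvino_info` declarations come from the header above, and `ggml_backend_free` comes from the core ggml-backend API.

```cpp
// Hypothetical smoke test for the OpenVINO backend API declared above.
// Assumes ggml was built with GGML_OPENVINO=ON and this file is linked against it.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-openvino.h"

int main() {
    const int n_dev = ggml_backend_openvino_get_device_count();
    std::printf("OpenVINO devices: %d\n", n_dev);
    if (n_dev == 0) {
        return 1;
    }

    // Initialize device 0 and confirm the backend type.
    ggml_backend_t backend = ggml_backend_openvino_init(0);
    if (backend == nullptr || !ggml_backend_is_openvino(backend)) {
        std::fprintf(stderr, "failed to initialize the OpenVINO backend\n");
        return 1;
    }

    // Query the device and host buffer types and check them with the new helpers.
    ggml_backend_buffer_type_t buft      = ggml_backend_openvino_buffer_type(0);
    ggml_backend_buffer_type_t host_buft = ggml_backend_openvino_host_buffer_type(0);
    std::printf("device buffer type is OpenVINO:    %d\n", ggml_backend_buft_is_openvino(buft));
    std::printf("host buffer type is OpenVINO host: %d\n", ggml_backend_buft_is_openvino_host(host_buft));

    // C++-only device info query; default_tensor_split is the field visible in this diff.
    const ggml_openvino_device_info & info = ggml_openvino_info();
    std::printf("default tensor split[0]: %f\n", info.default_tensor_split[0]);

    ggml_backend_free(backend);
    return 0;
}
```

Such buffer-type predicates are typically used when deciding whether a tensor already lives in a backend-compatible buffer; treat the exact semantics as defined by the implementation in this PR.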
3 changes: 2 additions & 1 deletion ggml/src/ggml-openvino/CMakeLists.txt
@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)

include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")

@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)

target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)

if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")