diff --git a/docs/build.md b/docs/build.md
index f7e793c155a..9fa6fccad19 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -610,10 +610,23 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```
+ - OpenCL
+ ```bash
+ sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+ ```
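+ Optionally verify that an OpenCL device is visible. This check uses the `clinfo` tool, which is not part of the packages above:
+ ```bash
+ sudo apt install clinfo
+ clinfo | grep "Device Name"
+ ```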
- **Windows:**
- - Download Microsoft.VisualStudio.2022.BuildTools [Visual_Studio_Build_Tools]https://aka.ms/vs/17/release/vs_BuildTools.exe Select "Desktop development with C++" under workloads.
+ - Download Microsoft.VisualStudio.2022.BuildTools: [Visual_Studio_Build_Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe)
+   Select "Desktop development with C++" under workloads.
- Install git
+ - Install OpenCL with vcpkg
+ ```powershell
+ cd C:\
+ git clone https://github.com/microsoft/vcpkg
+ cd vcpkg
+ .\bootstrap-vcpkg.bat
+ .\vcpkg install opencl
+ ```
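+ Optionally verify the installation (illustrative check using vcpkg's package listing):
+ ```powershell
+ .\vcpkg list opencl
+ ```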
- Use "x64 Native Tools Command Prompt" for Build
### 1. Install OpenVINO Runtime
@@ -625,19 +638,19 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu
-
+
```bash
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
chmod +x install-openvino-from-archive.sh
./install-openvino-from-archive.sh
```
+
+ Verify OpenVINO is initialized properly:
+ ```bash
+ echo $OpenVINO_DIR
+ ```
- - Verify OpenVINO is initialized properly
- - **Linux:**
- ```bash
- echo $OpenVINO_DIR
- ```
### 2. Build llama.cpp with OpenVINO Backend
@@ -651,22 +664,16 @@ git switch dev_backend_openvino
- **Linux:**
```bash
- # Build with OpenVINO support
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
- cmake --build build/ReleaseOV --config Release -j $(nproc)
+ cmake --build build/ReleaseOV --parallel
```
-- **Windows:**
+- **Windows:**
```bash
- # Build with OpenVINO support
"C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat"
- cmake -B build/ReleaseOV -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF
- cmake --build build\ReleaseOV --config Release
- ```
- - For faster compilation, add the -- /m argument to run multiple jobs in parallel with as many CPU cores available.
- ```bash
- cmake --build build\ReleaseOV --config Release -- /m
+ cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
+ cmake --build build\ReleaseOV --parallel
```
### 3. Download Sample Model
@@ -674,16 +681,9 @@ git switch dev_backend_openvino
Download models for testing:
```bash
-# Create models directory
mkdir -p ~/models/
-
-# Download model file: Llama-3.2-1B-Instruct.fp16.gguf
-wget https://huggingface.co/MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct.fp16.gguf \
- -O ~/models/Llama-3.2-1B-Instruct.fp16.gguf
-
-# Download model file: Phi-3-mini-4k-instruct-fp16.gguf
-wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-fp16.gguf \
- -O ~/models/Phi-3-mini-4k-instruct-fp16.gguf
+wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
+ -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```
### 4. Run inference with OpenVINO backend:
@@ -691,20 +691,14 @@ wget https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/P
When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-# Default device is GPU.
-# If not set, automatically selects the first available device in priority order: GPU, CPU, NPU.
+# If the device is unset or unavailable, the backend defaults to CPU.
export GGML_OPENVINO_DEVICE=GPU
-
-./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-
+./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```
To run in chat mode:
```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```
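+
+The same build also produces `llama-server`, which can serve the model over HTTP (a minimal sketch; the port is arbitrary):
+```bash
+./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080
+```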
### Configuration Options
@@ -716,16 +710,11 @@ Control OpenVINO behavior using these environment variables:
- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
-- **`GGML_OPENVINO_DEBUG_INPUT`**: Enable input debugging.
-- **`GGML_OPENVINO_DEBUG_OUTPUT`**: Enable output debugging.
### Example with Profiling
```bash
-export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
-export GGML_OPENVINO_PROFILING=1
-
-GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
+GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```
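+
+The dump options can be combined in the same way (illustrative; this assumes setting a variable to `1` enables it, as with profiling):
+```bash
+GGML_OPENVINO_DUMP_CGRAPH=1 GGML_OPENVINO_DUMP_IR=1 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
+```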
### Docker build Llama.cpp with OpenVINO Backend
@@ -741,7 +730,7 @@ docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile
# Build a minimal CLI-only image containing just the llama-cli executable.
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
-# Builds a server-only image with llama-server executable, health check endpoint, and REST API support.
+# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
# If you are behind a proxy:
@@ -764,17 +753,17 @@ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
-```
+```
Run Llama.cpp Server with OpenVINO Backend
```bash
# Run the Server Docker container server
-docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
# In a NEW terminal, test the server with curl
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
-export NO_PROXY=localhost,127.0.0.1
+export NO_PROXY=localhost,127.0.0.1
# Test health endpoint
curl -f http://localhost:8080/health
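+
+# Send a chat completion request (illustrative; llama-server exposes an OpenAI-compatible API)
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'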
diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h
index b690a16378e..46c1485f663 100644
--- a/ggml/include/ggml-openvino.h
+++ b/ggml/include/ggml-openvino.h
@@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
+
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
+
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
@@ -43,8 +51,10 @@ struct ggml_openvino_device_info {
std::array default_tensor_split = {};
};
-const ggml_openvino_device_info & ggml_openvino_info();
-
#ifdef __cplusplus
}
#endif
+
+#ifdef __cplusplus
+const ggml_openvino_device_info & ggml_openvino_info();
+#endif
diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt
index 3051a8b2405..175b585661d 100644
--- a/ggml/src/ggml-openvino/CMakeLists.txt
+++ b/ggml/src/ggml-openvino/CMakeLists.txt
@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
+find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
-target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb)
+target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 275a8a216ae..51fb433410c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -2,6 +2,8 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
+#include "ggml-openvino-extra.h"
+#include "ggml-openvino.h"
#include "ggml-quants.hpp"
#include
@@ -17,6 +19,7 @@
#include
#include