79 changes: 34 additions & 45 deletions docs/build.md
@@ -610,10 +610,23 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp with the OpenVINO backend
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```
- OpenCL development packages
```bash
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
```

- **Windows:**
- Download the Visual Studio 2022 Build Tools ([vs_BuildTools.exe](https://aka.ms/vs/17/release/vs_BuildTools.exe)) and select "Desktop development with C++" under Workloads
- Install git
- Install OpenCL with vcpkg
```powershell
cd C:\
git clone https://github.com/microsoft/vcpkg
cd vcpkg
.\bootstrap-vcpkg.bat
.\vcpkg install opencl
```
- Use "x64 Native Tools Command Prompt" for Build

### 1. Install OpenVINO Runtime
@@ -625,19 +638,19 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp with the OpenVINO backend
<details>
<summary>📦 Click to expand OpenVINO 2025.3 installation from an archive file on Ubuntu</summary>
<br>

```bash
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
chmod +x install-openvino-from-archive.sh
./install-openvino-from-archive.sh
```

Verify OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
</details>

### 2. Build llama.cpp with OpenVINO Backend

@@ -651,60 +664,41 @@ git switch dev_backend_openvino

- **Linux:**
```bash
# Build with OpenVINO support
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
cmake --build build/ReleaseOV --parallel
```

- **Windows:**
```bash
# Build with OpenVINO support
"C:\Program Files (x86)\Intel\openvino_2025.3.0\setupvars.bat"
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --parallel
```

### 3. Download Sample Model

Download a model for testing:

```bash
# Create models directory
mkdir -p ~/models/

# Download model file: Llama-3.2-1B-Instruct-Q4_0.gguf
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
  -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### 4. Run Inference with OpenVINO Backend

When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
# If GGML_OPENVINO_DEVICE is unset or the requested device is unavailable, the backend defaults to CPU.
export GGML_OPENVINO_DEVICE=GPU

./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

To run in chat mode:
```bash
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
```

### Configuration Options
@@ -716,16 +710,11 @@ Control OpenVINO behavior using these environment variables:
- **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
- **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
- **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.

### Example with Profiling

```bash
GGML_OPENVINO_PROFILING=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
```

### Docker build of llama.cpp with OpenVINO Backend
@@ -741,7 +730,7 @@ docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile
# Build a minimal CLI-only image containing just the llama-cli executable.
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .

# Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .

# If you are behind a proxy:
@@ -764,17 +753,17 @@ llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf
```

Run llama.cpp Server with OpenVINO Backend
```bash
# Run the server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct.fp16.gguf

# In a NEW terminal, test the server with curl

# If you are behind a proxy, set NO_PROXY so requests to localhost bypass the proxy
export NO_PROXY=localhost,127.0.0.1

# Test health endpoint
curl -f http://localhost:8080/health
14 changes: 12 additions & 2 deletions ggml/include/ggml-openvino.h
@@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);

GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);

GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);

GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);

GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
@@ -43,8 +51,10 @@ struct ggml_openvino_device_info {
std::array<float, GGML_OPENVINO_MAX_DEVICES> default_tensor_split = {};
};

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
const ggml_openvino_device_info & ggml_openvino_info();
#endif
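
For orientation, here is a minimal smoke-test sketch (not part of this PR) showing how the entry points declared in this header could be exercised. The `main` program, the printed messages, and the assumption that ggml is built with `GGML_OPENVINO=ON` and linked in are all illustrative; only the `ggml_backend_openvino_*`, `ggml_backend_buft_*`, and `ggml_openvino_info` declarations come from the header above, and `ggml_backend_free` comes from the core ggml-backend API.

```cpp
// Hypothetical smoke test for the OpenVINO backend API declared above.
// Assumes ggml was built with GGML_OPENVINO=ON and this file is linked against it.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-openvino.h"

int main() {
    const int n_dev = ggml_backend_openvino_get_device_count();
    std::printf("OpenVINO devices: %d\n", n_dev);
    if (n_dev == 0) {
        return 1;
    }

    // Initialize device 0 and confirm the backend type.
    ggml_backend_t backend = ggml_backend_openvino_init(0);
    if (backend == nullptr || !ggml_backend_is_openvino(backend)) {
        std::fprintf(stderr, "failed to initialize the OpenVINO backend\n");
        return 1;
    }

    // Query the device and host buffer types and check them with the new helpers.
    ggml_backend_buffer_type_t buft      = ggml_backend_openvino_buffer_type(0);
    ggml_backend_buffer_type_t host_buft = ggml_backend_openvino_host_buffer_type(0);
    std::printf("device buffer type is OpenVINO:    %d\n", ggml_backend_buft_is_openvino(buft));
    std::printf("host buffer type is OpenVINO host: %d\n", ggml_backend_buft_is_openvino_host(host_buft));

    // C++-only device info query; default_tensor_split is the field visible in this diff.
    const ggml_openvino_device_info & info = ggml_openvino_info();
    std::printf("default tensor split[0]: %f\n", info.default_tensor_split[0]);

    ggml_backend_free(backend);
    return 0;
}
```

Such buffer-type predicates are typically used when deciding whether a tensor already lives in a backend-compatible buffer; treat the exact semantics as defined by the implementation in this PR.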
3 changes: 2 additions & 1 deletion ggml/src/ggml-openvino/CMakeLists.txt
@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)

include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")

@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)

target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)

if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")