From c333a87e4b42058c3991c02dc76c095b96eb0ed8 Mon Sep 17 00:00:00 2001 From: Raghotham Murthy Date: Tue, 4 Nov 2025 13:07:43 -0800 Subject: [PATCH] feat: add support for llama stack helper --- README.md | 138 ++++ claude_desktop_config.json | 3 + examples/cc_vec_complete_rag_workflow.py | 5 + examples/cc_vec_rag_example.py | 8 + pyproject.toml | 7 + src/cc_vec/api.py | 8 + src/cc_vec/cli/main.py | 6 + src/cc_vec/lib/index.py | 47 +- src/cc_vec/mcp/handlers/cc_index.py | 8 + src/cc_vec/types/main_config.py | 7 + src/cc_vec/types/openai_config.py | 1 + src/llama_stack_helper/__init__.py | 5 + src/llama_stack_helper/conf/ollama-build.yaml | 34 + src/llama_stack_helper/conf/ollama-run.yaml | 109 +++ src/llama_stack_helper/llama_stack_helper.py | 710 ++++++++++++++++++ 15 files changed, 1095 insertions(+), 1 deletion(-) create mode 100644 src/llama_stack_helper/__init__.py create mode 100644 src/llama_stack_helper/conf/ollama-build.yaml create mode 100644 src/llama_stack_helper/conf/ollama-run.yaml create mode 100644 src/llama_stack_helper/llama_stack_helper.py diff --git a/README.md b/README.md index 7658e6f..40dc78f 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Search, analyze, and index Common Crawl data into vector stores for RAG applicat - **`AWS_SESSION_TOKEN`** - Optional for Athena/S3 access (needed to run Athena queries). This is required for temporary credentials - **`OPENAI_API_KEY`** - Required for vector operations (index, query, list) - `OPENAI_BASE_URL` - Optional custom OpenAI endpoint (e.g., `http://localhost:8321/v1` for Llama Stack) +- `OPENAI_VERIFY_SSL` - Verify SSL certificates (default: `true`). Set to `false` for self-signed certs or local development. ⚠️ Use only with trusted endpoints. - `OPENAI_EMBEDDING_MODEL` - Embedding model to use (e.g., `text-embedding-3-small`, `nomic-embed-text`) - `OPENAI_EMBEDDING_DIMENSIONS` - Embedding dimensions (optional, model-specific) - `AWS_DEFAULT_REGION` - AWS region (defaults to us-west-2) @@ -66,6 +67,22 @@ uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "ml-researc # Vector store name is optional - will auto-generate if not provided uv run cc-vec index --url-patterns "%.github.io" --limit 50 +# Using with alternative OpenAI-compatible endpoints (Ollama example) +export OPENAI_BASE_URL=http://localhost:11434/v1 +export OPENAI_API_KEY=ollama # Ollama doesn't require a real key +export OPENAI_EMBEDDING_MODEL=nomic-embed-text +uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "local-research" --limit 50 + +# Using with Llama Stack +export OPENAI_BASE_URL=http://localhost:8321/v1 +uv run cc-vec index --url-patterns "%.edu" --vector-store-name "education" --limit 100 + +# With self-signed certificates or local development (disable SSL verification) +export OPENAI_BASE_URL=https://localhost:8443/v1 +export OPENAI_VERIFY_SSL=false # ⚠️ Use only in development with trusted endpoints +export OPENAI_API_KEY=your-key +uv run cc-vec index --url-patterns "%.github.io" --vector-store-name "local-dev" --limit 50 + # List cc-vec vector stores (default - only shows stores created by cc-vec) uv run cc-vec list --output json @@ -80,9 +97,111 @@ uv run cc-vec query "Explain deep learning" --vector-store-name "ml-research" -- ``` +## 1.5. 
🦙 Local Llama Stack Setup (Optional) + +For running cc-vec with local models via Llama Stack + Ollama, use the standalone manager script: + +**Prerequisites:** +- Ollama installed and running (`ollama serve`) +- Docker (for Docker backend) or uv (for UV backend) + +**First-time setup:** + +```bash +# Install and start Ollama first +# macOS/Linux: curl -fsSL https://ollama.com/install.sh | sh +ollama serve & + +# Run setup (pulls required models, installs dependencies) +uv run llama-stack-helper setup --backend docker + +# Or for UV backend: +uv run llama-stack-helper setup --backend uv +``` + +**Start Llama Stack:** + +```bash +# Docker backend (recommended) +uv run llama-stack-helper start --backend docker + +# Or UV backend +uv run llama-stack-helper start --backend uv +``` + +**Check status:** + +```bash +uv run llama-stack-helper status +``` + +**View logs:** + +```bash +# Show last 20 lines +uv run llama-stack-helper logs + +# Follow logs in real-time +uv run llama-stack-helper logs --follow +``` + +**Stop Llama Stack:** + +```bash +uv run llama-stack-helper stop --backend docker +``` + +**Use with cc-vec:** + +Once Llama Stack is running, set the environment variables: + +```bash +# Set Llama Stack environment variables in your current shell +eval "$(uv run llama-stack-helper env)" + +# Now use cc-vec normally with your Athena credentials +export ATHENA_OUTPUT_BUCKET=s3://your-bucket/ +export AWS_ACCESS_KEY_ID=your-key +export AWS_SECRET_ACCESS_KEY=your-secret + +uv run cc-vec index --url-patterns "%.edu" --limit 10 +``` + +The `env` command outputs (using your configured models): +```bash +export OPENAI_BASE_URL=http://localhost:8321/v1 +export OPENAI_API_KEY=none +export OPENAI_VERIFY_SSL=false +export OPENAI_EMBEDDING_MODEL=toshk0/nomic-embed-text-v2-moe:Q6_K # or your custom model +export OPENAI_EMBEDDING_DIMENSIONS=768 # or your custom dimensions +``` + +**Default models** (automatically pulled during setup): +- `llama3.2:3b` - Inference model +- `toshk0/nomic-embed-text-v2-moe:Q6_K` - Embedding model (768 dimensions) + +**Custom models** (optional): + +You can customize which models to use by setting environment variables before running setup: + +```bash +export LLAMA_STACK_INFERENCE_MODEL=llama3.2:1b +export LLAMA_STACK_EMBEDDING_MODEL=nomic-embed-text +export LLAMA_STACK_EMBEDDING_DIMENSIONS=768 + +# Now run setup - it will pull your custom models +uv run llama-stack-helper setup +``` + +These models will be: +1. Downloaded into Ollama during setup +2. Configured in the Llama Stack run.yaml +3. Used automatically by cc-vec when you run `eval "$(uv run llama-stack-helper env)"` + ## 2. 
📦 Python Library ```python +import os from cc_vec import ( search, stats, @@ -95,6 +214,22 @@ from cc_vec import ( VectorStoreConfig, ) +# For alternative endpoints, set environment variables before importing +# Example: Using Ollama +# os.environ["OPENAI_BASE_URL"] = "http://localhost:11434/v1" +# os.environ["OPENAI_API_KEY"] = "ollama" +# os.environ["OPENAI_EMBEDDING_MODEL"] = "nomic-embed-text" + +# Example: Using Llama Stack +# os.environ["OPENAI_BASE_URL"] = "http://localhost:8321/v1" +# os.environ["OPENAI_API_KEY"] = "your-llama-stack-key" + +# Example: With self-signed certificates (disable SSL verification) +# ⚠️ Use only in development with trusted endpoints +# os.environ["OPENAI_BASE_URL"] = "https://localhost:8443/v1" +# os.environ["OPENAI_VERIFY_SSL"] = "false" +# os.environ["OPENAI_API_KEY"] = "your-key" + # Basic search and stats (no OpenAI key needed) filter_config = FilterConfig(url_patterns=["%.github.io"]) @@ -199,6 +334,9 @@ The config uses stdio mode (required by Claude Desktop): "env": { "ATHENA_OUTPUT_BUCKET": "your-athena-output-bucket", "OPENAI_API_KEY": "your-openai-api-key-here" + // "OPENAI_BASE_URL": "http://localhost:11434/v1" // Optional: Use for Ollama, Llama Stack, or other endpoints + // "OPENAI_VERIFY_SSL": "false" // Optional: Disable SSL verification for self-signed certs (dev only) + // "OPENAI_EMBEDDING_MODEL": "nomic-embed-text" // Optional: Specify custom embedding model } } } diff --git a/claude_desktop_config.json b/claude_desktop_config.json index 9fbe9f4..1ad556f 100644 --- a/claude_desktop_config.json +++ b/claude_desktop_config.json @@ -14,6 +14,9 @@ "env": { "ATHENA_OUTPUT_BUCKET": "s3://llama-stack-dev-test0/athena-results/", "OPENAI_API_KEY": "your-openai-api-key-here" + // "OPENAI_BASE_URL": "http://localhost:11434/v1" // Optional: Use for Ollama, Llama Stack, or other OpenAI-compatible endpoints + // "OPENAI_VERIFY_SSL": "false" // Optional: Disable SSL verification for self-signed certs (dev only) + // "OPENAI_EMBEDDING_MODEL": "nomic-embed-text" // Optional: Specify custom embedding model } } } diff --git a/examples/cc_vec_complete_rag_workflow.py b/examples/cc_vec_complete_rag_workflow.py index f7ae6ff..8cb435e 100644 --- a/examples/cc_vec_complete_rag_workflow.py +++ b/examples/cc_vec_complete_rag_workflow.py @@ -10,6 +10,11 @@ 6. 
Cleanup Run with: uv run python examples/complete_rag_workflow.py + +Optional environment variables for alternative endpoints: + - OPENAI_BASE_URL: Custom endpoint (e.g., http://localhost:11434/v1) + - OPENAI_VERIFY_SSL: Set to "false" for self-signed certs (dev only) + - OPENAI_EMBEDDING_MODEL: Custom model (e.g., nomic-embed-text) """ import os diff --git a/examples/cc_vec_rag_example.py b/examples/cc_vec_rag_example.py index 79b135f..ccb5980 100644 --- a/examples/cc_vec_rag_example.py +++ b/examples/cc_vec_rag_example.py @@ -10,6 +10,11 @@ - OPENAI_API_KEY environment variable - ATHENA_OUTPUT_BUCKET environment variable - AWS credentials configured + +Optional (for alternative endpoints): + - OPENAI_BASE_URL: Custom OpenAI-compatible endpoint (e.g., http://localhost:11434/v1 for Ollama) + - OPENAI_VERIFY_SSL: Set to "false" to disable SSL verification for self-signed certs (dev only) + - OPENAI_EMBEDDING_MODEL: Custom embedding model (e.g., nomic-embed-text for Ollama) """ import os @@ -20,6 +25,9 @@ def main(): # Initialize OpenAI client + # Note: If using alternative endpoints with self-signed certs, set: + # export OPENAI_BASE_URL=https://localhost:8443/v1 + # export OPENAI_VERIFY_SSL=false client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) # Step 1: Index Common Crawl content into a vector store diff --git a/pyproject.toml b/pyproject.toml index 577edbf..3a59565 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,11 +27,18 @@ dev = [ [project.scripts] cc-vec = "cc_vec.cli.main:main" +llama-stack-helper = "llama_stack_helper:main" [build-system] requires = ["uv_build>=0.8.13,<0.9.0"] build-backend = "uv_build" +[tool.uv] +package = true + +[tool.setuptools.package-data] +llama_stack_helper = ["conf/*.yaml"] + [dependency-groups] dev = [ "pytest>=8.4.2", diff --git a/src/cc_vec/api.py b/src/cc_vec/api.py index c7a68e8..cfc41ab 100644 --- a/src/cc_vec/api.py +++ b/src/cc_vec/api.py @@ -3,6 +3,7 @@ import logging from typing import List, Dict, Any, Optional +import httpx from openai import OpenAI from .types import FilterConfig, CrawlRecord, StatsResponse, VectorStoreConfig from .types.config import load_config @@ -55,9 +56,16 @@ def _get_openai_client() -> OpenAI: global _openai_client if _openai_client is None: config = load_config() + + # Create custom httpx client if SSL verification is disabled + http_client = None + if not config.openai.verify_ssl: + http_client = httpx.Client(verify=False) + _openai_client = OpenAI( api_key=config.openai.api_key, base_url=config.openai.base_url, + http_client=http_client, ) return _openai_client diff --git a/src/cc_vec/cli/main.py b/src/cc_vec/cli/main.py index a396810..4be2b6c 100755 --- a/src/cc_vec/cli/main.py +++ b/src/cc_vec/cli/main.py @@ -651,11 +651,17 @@ def index(ctx, vector_store_name, limit, chunk_size, overlap, output, **filter_k click.echo(f"Auto-generated vector store name: {vector_store_name}") + # Load config to get embedding model settings from environment + config = load_config() + # Construct VectorStoreConfig vector_store_config = VectorStoreConfig( name=vector_store_name, chunk_size=chunk_size, overlap=overlap, + embedding_model=config.openai.embedding_model + or "text-embedding-3-small", + embedding_dimensions=config.openai.embedding_dimensions or 1536, ) # Use the simplified API that handles all client initialization diff --git a/src/cc_vec/lib/index.py b/src/cc_vec/lib/index.py index 9a972d2..23d591d 100644 --- a/src/cc_vec/lib/index.py +++ b/src/cc_vec/lib/index.py @@ -43,7 +43,7 @@ def create_vector_store(self) -> 
str: "created_by": "cc-vec", "cc_vec_version": "0.1.0", "embedding_model": self.config.embedding_model, - "embedding_dimension": str(self.config.embedding_dimensions), + "embedding_dimensions": str(self.config.embedding_dimensions), }, "chunking_strategy": { "type": "static", @@ -144,6 +144,34 @@ def upload_to_vector_store( logger.info(f"Upload completed with status: {file_batch.status}") logger.info(f"File counts: {file_batch.file_counts}") + # Log detailed failure information if any files failed + if file_batch.file_counts.failed > 0: + logger.warning(f"{file_batch.file_counts.failed} files failed to upload") + # Try to get detailed error information for failed files + try: + batch_files = self.client.vector_stores.file_batches.list_files( + vector_store_id=vector_store_id, + batch_id=file_batch.id, + filter="failed" + ) + for failed_file in batch_files.data[:3]: # Show first 3 failures + # Try multiple ways to get error information + error_msg = getattr(failed_file, 'last_error', None) + status = getattr(failed_file, 'status', 'unknown') + + if error_msg: + # last_error might be an object with message/code + if hasattr(error_msg, 'message'): + logger.error(f"File {failed_file.id} ({status}): {error_msg.message}") + else: + logger.error(f"File {failed_file.id} ({status}): {error_msg}") + else: + # Dump the entire object to see what's available + logger.error(f"File {failed_file.id} status: {status}") + logger.error(f"Full file object: {failed_file}") + except Exception as list_error: + logger.warning(f"Could not retrieve detailed failure information: {list_error}") + return { "status": file_batch.status, "file_counts": file_batch.file_counts, @@ -233,6 +261,23 @@ def index( upload_result = loader.upload_to_vector_store(vector_store_id, successful_fetches) + # Check if upload failed completely + file_counts = upload_result["file_counts"] + if upload_result["status"] == "failed" and file_counts.completed == 0: + error_msg = ( + f"All {file_counts.total} files failed to upload to vector store. " + f"Failed: {file_counts.failed}, Cancelled: {file_counts.cancelled}. " + f"Check the logs above for detailed error messages." 
+ ) + logger.error(error_msg) + raise RuntimeError(error_msg) + + # Warn if some files failed but some succeeded + if file_counts.failed > 0 and file_counts.completed > 0: + logger.warning( + f"Partial upload success: {file_counts.completed} succeeded, {file_counts.failed} failed" + ) + return { "vector_store_id": vector_store_id, "vector_store_name": vector_store_config.name, diff --git a/src/cc_vec/mcp/handlers/cc_index.py b/src/cc_vec/mcp/handlers/cc_index.py index 4525e73..f75cae8 100644 --- a/src/cc_vec/mcp/handlers/cc_index.py +++ b/src/cc_vec/mcp/handlers/cc_index.py @@ -46,11 +46,19 @@ async def handle(self, args: Dict[str, Any]) -> List[TextContent]: else: vector_store_name = f"ccvec_{timestamp}" + # Load config to get embedding model settings from environment + from ...types.config import load_config + + config = load_config() + # Construct VectorStoreConfig vector_store_config = VectorStoreConfig( name=vector_store_name, chunk_size=chunk_size, overlap=overlap, + embedding_model=config.openai.embedding_model + or "text-embedding-3-small", + embedding_dimensions=config.openai.embedding_dimensions or 1536, ) try: diff --git a/src/cc_vec/types/main_config.py b/src/cc_vec/types/main_config.py index 96d70d6..0e429fe 100644 --- a/src/cc_vec/types/main_config.py +++ b/src/cc_vec/types/main_config.py @@ -34,6 +34,8 @@ def from_env(cls) -> "CCVecConfig": embedding_dimensions=int(os.getenv("OPENAI_EMBEDDING_DIMENSIONS")) if os.getenv("OPENAI_EMBEDDING_DIMENSIONS") else None, + verify_ssl=os.getenv("OPENAI_VERIFY_SSL", "true").lower() + not in ["false", "0", "no"], ), logging=LoggingSettings( level=os.getenv("LOG_LEVEL", "INFO"), @@ -62,6 +64,11 @@ def setup_logging(self) -> None: logger.info("OpenAI client configured") if self.openai.base_url: logger.info(f"Using custom OpenAI base URL: {self.openai.base_url}") + if not self.openai.verify_ssl: + logger.warning( + "⚠️ SSL certificate verification is DISABLED. " + "This should only be used in development with trusted endpoints." + ) else: logger.warning( "OpenAI API key not configured - vector operations unavailable" diff --git a/src/cc_vec/types/openai_config.py b/src/cc_vec/types/openai_config.py index 69c477d..5aab76c 100644 --- a/src/cc_vec/types/openai_config.py +++ b/src/cc_vec/types/openai_config.py @@ -12,6 +12,7 @@ class OpenAISettings: base_url: Optional[str] = None embedding_model: Optional[str] = None embedding_dimensions: Optional[int] = None + verify_ssl: bool = True def is_configured(self) -> bool: """Check if OpenAI is properly configured.""" diff --git a/src/llama_stack_helper/__init__.py b/src/llama_stack_helper/__init__.py new file mode 100644 index 0000000..fde20c1 --- /dev/null +++ b/src/llama_stack_helper/__init__.py @@ -0,0 +1,5 @@ +"""Llama Stack Helper - Manage Ollama + Llama Stack for cc-vec""" + +from .llama_stack_helper import main + +__all__ = ["main"] diff --git a/src/llama_stack_helper/conf/ollama-build.yaml b/src/llama_stack_helper/conf/ollama-build.yaml new file mode 100644 index 0000000..faebba1 --- /dev/null +++ b/src/llama_stack_helper/conf/ollama-build.yaml @@ -0,0 +1,34 @@ +version: 2 +distribution_spec: + description: Quick start template for running Llama Stack with several popular providers. + This distribution is intended for CPU-only environments. 
+ providers: + inference: + - provider_type: remote::ollama + vector_io: + - provider_type: inline::faiss + files: + - provider_type: inline::localfs + safety: + - provider_type: inline::llama-guard + - provider_type: inline::code-scanner + agents: + - provider_type: inline::meta-reference + eval: + - provider_type: inline::meta-reference + datasetio: + - provider_type: remote::huggingface + - provider_type: inline::localfs + scoring: + - provider_type: inline::basic + - provider_type: inline::llm-as-judge + - provider_type: inline::braintrust + tool_runtime: + - provider_type: remote::model-context-protocol + batches: + - provider_type: inline::reference +image_type: venv +additional_pip_packages: +- aiosqlite +- asyncpg +- sqlalchemy[asyncio] diff --git a/src/llama_stack_helper/conf/ollama-run.yaml b/src/llama_stack_helper/conf/ollama-run.yaml new file mode 100644 index 0000000..9aaf171 --- /dev/null +++ b/src/llama_stack_helper/conf/ollama-run.yaml @@ -0,0 +1,109 @@ +version: 2 +image_name: starter +external_providers_dir: ${env.LLAMA_STACK_DATA_DIR:=~/.llama}/providers.d +apis: +- inference +- vector_io +- files +- safety +- tool_runtime +- agents + +providers: + inference: + # Single Ollama provider for all models + - provider_id: ollama + provider_type: remote::ollama + config: + url: ${env.OLLAMA_URL:=http://localhost:11434} + + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + persistence: + namespace: vector_io::faiss + backend: kv_default + + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.LLAMA_STACK_DATA_DIR:=~/.llama}/files + metadata_store: + table_name: files_metadata + backend: sql_default + + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence: + agent_state: + namespace: agents + backend: kv_default + responses: + table_name: responses + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + +storage: + backends: + kv_default: + type: kv_sqlite + db_path: ${env.LLAMA_STACK_DATA_DIR:=~/.llama}/kvstore.db + sql_default: + type: sql_sqlite + db_path: ${env.LLAMA_STACK_DATA_DIR:=~/.llama}/sql_store.db + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + +registered_resources: + models: + # Inference model (configurable via LLAMA_STACK_INFERENCE_MODEL) + - model_id: ${env.LLAMA_STACK_INFERENCE_MODEL:=llama3.2:3b} + provider_id: ollama + provider_model_id: ${env.LLAMA_STACK_INFERENCE_MODEL:=llama3.2:3b} + model_type: llm + # Embedding model (configurable via LLAMA_STACK_EMBEDDING_MODEL) + - model_id: ${env.LLAMA_STACK_EMBEDDING_MODEL:=toshk0/nomic-embed-text-v2-moe:Q6_K} + provider_id: ollama + provider_model_id: ${env.LLAMA_STACK_EMBEDDING_MODEL:=toshk0/nomic-embed-text-v2-moe:Q6_K} + model_type: embedding + metadata: + embedding_dimension: ${env.LLAMA_STACK_EMBEDDING_DIMENSIONS:=768} + shields: [] + vector_dbs: [] + datasets: [] + scoring_fns: [] + benchmarks: [] + tool_groups: [] + +server: + port: 8321 + +telemetry: + enabled: true + +vector_stores: + default_provider_id: faiss + default_embedding_model: + provider_id: ollama + model_id: ${env.LLAMA_STACK_EMBEDDING_MODEL:=toshk0/nomic-embed-text-v2-moe:Q6_K} 
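Note on the run config above: both models are wired in through `${env.VAR:=default}` substitution, so whatever `LLAMA_STACK_INFERENCE_MODEL` and `LLAMA_STACK_EMBEDDING_MODEL` resolve to when the stack starts is what the server registers. As a quick sanity check, the registrations can be listed through the same OpenAI-compatible base URL that `llama-stack-helper env` prints; this is a minimal sketch, not part of the patch, and it assumes the stack is already running on the default port 8321 and exposes the standard OpenAI `models` route under that base URL:

```python
# Sketch: confirm the models registered in ollama-run.yaml are visible through
# the OpenAI-compatible endpoint. Assumes Llama Stack is up on port 8321.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

for model in client.models.list():
    # Expect the inference model (default llama3.2:3b) and the embedding model
    # (default toshk0/nomic-embed-text-v2-moe:Q6_K) to appear in this list.
    print(model.id)
```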
diff --git a/src/llama_stack_helper/llama_stack_helper.py b/src/llama_stack_helper/llama_stack_helper.py new file mode 100644 index 0000000..95cd0ec --- /dev/null +++ b/src/llama_stack_helper/llama_stack_helper.py @@ -0,0 +1,710 @@ +#!/usr/bin/env python3 +""" +Llama Stack Helper - Standalone script to manage Ollama + Llama Stack + +This script can: +- Check and help install Ollama +- Pull required models +- Start Llama Stack (Docker or local via uv) +- Manage lifecycle (start/stop/status/logs) + +Usage: + uv run llama-stack-helper setup + uv run llama-stack-helper start --backend docker + uv run llama-stack-helper stop + uv run llama-stack-helper status +""" + +import argparse +import json +import os +import signal +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + +# Model configuration (can be overridden via environment variables) +DEFAULT_INFERENCE_MODEL = "llama3.2:3b" +DEFAULT_EMBEDDING_MODEL = "toshk0/nomic-embed-text-v2-moe:Q6_K" +DEFAULT_EMBEDDING_DIMENSIONS = 768 + +# Get models from environment or use defaults +INFERENCE_MODEL = os.getenv("LLAMA_STACK_INFERENCE_MODEL", DEFAULT_INFERENCE_MODEL) +EMBEDDING_MODEL = os.getenv("LLAMA_STACK_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL) +EMBEDDING_DIMENSIONS = int(os.getenv("LLAMA_STACK_EMBEDDING_DIMENSIONS", str(DEFAULT_EMBEDDING_DIMENSIONS))) + +# Required models based on configuration +REQUIRED_MODELS = { + INFERENCE_MODEL: "Inference model", + EMBEDDING_MODEL: f"Embedding model ({EMBEDDING_DIMENSIONS} dimensions)", +} + +# Default configuration +DEFAULT_PORT = 8321 +DEFAULT_DATA_DIR = Path.home() / ".llama" +DEFAULT_OLLAMA_URL = "http://localhost:11434" + +# Colors for output +class Colors: + GREEN = '\033[0;32m' + RED = '\033[0;31m' + YELLOW = '\033[1;33m' + BLUE = '\033[0;34m' + NC = '\033[0m' + + +def log_info(msg: str): + print(f"{Colors.GREEN}[INFO]{Colors.NC} {msg}") + + +def log_error(msg: str): + print(f"{Colors.RED}[ERROR]{Colors.NC} {msg}") + + +def log_warn(msg: str): + print(f"{Colors.YELLOW}[WARN]{Colors.NC} {msg}") + + +def log_step(msg: str): + print(f"{Colors.BLUE}[STEP]{Colors.NC} {msg}") + + +def log_success(msg: str): + print(f"{Colors.GREEN}✓{Colors.NC} {msg}") + + +class OllamaManager: + """Manages Ollama installation and models""" + + def __init__(self, url: str = DEFAULT_OLLAMA_URL): + self.url = url + + def is_running(self) -> bool: + """Check if Ollama is accessible""" + try: + import requests + response = requests.get(f"{self.url}/api/tags", timeout=2) + return response.status_code == 200 + except Exception: + return False + + def list_models(self) -> List[str]: + """List available models""" + try: + import requests + response = requests.get(f"{self.url}/api/tags", timeout=5) + if response.status_code == 200: + return [m["name"] for m in response.json().get("models", [])] + except Exception: + pass + return [] + + def has_model(self, model: str) -> bool: + """Check if specific model is available""" + models = self.list_models() + return model in models + + def pull_model(self, model: str) -> bool: + """Pull a model from Ollama""" + log_info(f"Pulling model: {model}") + try: + result = subprocess.run( + ["ollama", "pull", model], + check=True, + capture_output=False + ) + return result.returncode == 0 + except subprocess.CalledProcessError: + log_error(f"Failed to pull model: {model}") + return False + except FileNotFoundError: + log_error("Ollama CLI not found. 
Is Ollama installed?") + return False + + def get_installation_instructions(self) -> str: + """Get platform-specific installation instructions""" + return """ +Install Ollama: + macOS/Linux: curl -fsSL https://ollama.com/install.sh | sh + Windows: Download from https://ollama.com/download + +Then start Ollama: + ollama serve + +Or on macOS, Ollama runs automatically after installation. +""" + + +class DockerBackend: + """Manages Llama Stack via Docker""" + + def __init__(self, port: int = DEFAULT_PORT): + self.port = port + self.container_name = "llama-stack-cc-vec" + self.image = "llamastack/distribution-starter" + self.config_path = self._get_config_path() + + def _get_config_path(self) -> Path: + """Get config path""" + script_dir = Path(__file__).parent + config = script_dir / "conf" / "ollama-run.yaml" + if not config.exists(): + log_error(f"Config not found: {config}") + sys.exit(1) + return config + + def check_docker(self) -> bool: + """Check if Docker is available""" + try: + subprocess.run( + ["docker", "--version"], + check=True, + capture_output=True + ) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + + def is_running(self) -> bool: + """Check if container is running""" + try: + result = subprocess.run( + ["docker", "ps", "--filter", f"name={self.container_name}", "--format", "{{.Names}}"], + capture_output=True, + text=True, + check=True + ) + return self.container_name in result.stdout + except subprocess.CalledProcessError: + return False + + def start(self) -> bool: + """Start Docker container""" + if self.is_running(): + log_warn("Llama Stack container already running") + return True + + log_info(f"Starting Llama Stack (Docker) on port {self.port}") + + # Remove old container if exists + subprocess.run( + ["docker", "rm", "-f", self.container_name], + capture_output=True + ) + + try: + subprocess.run([ + "docker", "run", "-d", + "--name", self.container_name, + "-p", f"{self.port}:{self.port}", + "-v", f"{DEFAULT_DATA_DIR}:/root/.llama", + "-v", f"{self.config_path}:/app/run.yaml", + "-e", "RUN_CONFIG_PATH=/app/run.yaml", + "-e", "LLAMA_STACK_DATA_DIR=/root/.llama", + "-e", f"OLLAMA_URL=http://host.docker.internal:11434", + "-e", f"LLAMA_STACK_INFERENCE_MODEL={INFERENCE_MODEL}", + "-e", f"LLAMA_STACK_EMBEDDING_MODEL={EMBEDDING_MODEL}", + "-e", f"LLAMA_STACK_EMBEDDING_DIMENSIONS={EMBEDDING_DIMENSIONS}", + self.image, + "--port", str(self.port) + ], check=True) + + log_info("Waiting for Llama Stack to be ready...") + if self._wait_for_health(timeout=30): + log_success(f"Llama Stack running at http://localhost:{self.port}") + return True + else: + log_error("Llama Stack failed to become healthy") + return False + + except subprocess.CalledProcessError as e: + log_error(f"Failed to start Docker container: {e}") + return False + + def stop(self) -> bool: + """Stop Docker container""" + if not self.is_running(): + log_warn("Llama Stack container not running") + return True + + try: + subprocess.run(["docker", "stop", self.container_name], check=True) + subprocess.run(["docker", "rm", self.container_name], check=True) + log_success("Llama Stack stopped") + return True + except subprocess.CalledProcessError as e: + log_error(f"Failed to stop container: {e}") + return False + + def logs(self, tail: int = 20, follow: bool = False) -> None: + """Show container logs""" + cmd = ["docker", "logs"] + if follow: + cmd.append("-f") + cmd.extend(["--tail", str(tail), self.container_name]) + + try: + subprocess.run(cmd) + except 
subprocess.CalledProcessError: + log_error("Failed to get logs") + + def _wait_for_health(self, timeout: int = 30) -> bool: + """Wait for service to be healthy""" + import requests + start = time.time() + while time.time() - start < timeout: + try: + response = requests.get(f"http://localhost:{self.port}/health", timeout=2) + if response.status_code == 200: + return True + except Exception: + pass + time.sleep(2) + return False + + +class LocalBackend: + """Manages Llama Stack via local uv/venv""" + + def __init__(self, port: int = DEFAULT_PORT, data_dir: Path = DEFAULT_DATA_DIR): + self.port = port + self.data_dir = data_dir + self.pid_file = self.data_dir / "llamastack.pid" + self.log_file = self.data_dir / "llamastack.log" + self.config_path = self._get_config_path() + + def _get_config_path(self) -> Path: + """Get config path""" + script_dir = Path(__file__).parent + config = script_dir / "conf" / "ollama-run.yaml" + if not config.exists(): + log_error(f"Config not found: {config}") + sys.exit(1) + return config + + def check_dependencies(self) -> bool: + """Check if llama CLI is available""" + try: + subprocess.run( + ["llama", "--version"], + check=True, + capture_output=True + ) + return True + except (subprocess.CalledProcessError, FileNotFoundError): + return False + + def setup(self) -> bool: + """Install dependencies via uv""" + log_step("Installing Llama Stack dependencies via uv...") + + try: + subprocess.run([ + "uv", "pip", "install", + "llama-stack", + "llama-stack-client", + "faiss-cpu", + "aiosqlite", + "sqlalchemy[asyncio]" + ], check=True) + + # Create data directories + self.data_dir.mkdir(parents=True, exist_ok=True) + (self.data_dir / "files").mkdir(exist_ok=True) + (self.data_dir / "providers.d").mkdir(exist_ok=True) + + log_success("Dependencies installed") + return True + + except subprocess.CalledProcessError as e: + log_error(f"Failed to install dependencies: {e}") + return False + + def is_running(self) -> bool: + """Check if process is running""" + if not self.pid_file.exists(): + return False + + try: + pid = int(self.pid_file.read_text()) + os.kill(pid, 0) + return True + except (ProcessLookupError, ValueError): + self.pid_file.unlink(missing_ok=True) + return False + + def start(self) -> bool: + """Start Llama Stack locally""" + if self.is_running(): + log_warn("Llama Stack already running") + return True + + if not self.check_dependencies(): + log_error("Llama Stack dependencies not installed") + log_info("Run: uv run llama-stack-helper setup") + return False + + log_info(f"Starting Llama Stack (local) on port {self.port}") + + # Set environment variables + env = os.environ.copy() + env["LLAMA_STACK_DATA_DIR"] = str(self.data_dir) + env["LLAMA_STACK_PORT"] = str(self.port) + env["OLLAMA_URL"] = DEFAULT_OLLAMA_URL + env["LLAMA_STACK_INFERENCE_MODEL"] = INFERENCE_MODEL + env["LLAMA_STACK_EMBEDDING_MODEL"] = EMBEDDING_MODEL + env["LLAMA_STACK_EMBEDDING_DIMENSIONS"] = str(EMBEDDING_DIMENSIONS) + + try: + # Start in background + log_file = open(self.log_file, "w") + process = subprocess.Popen( + ["uv", "run", "llama", "stack", "run", str(self.config_path), "--port", str(self.port)], + stdout=log_file, + stderr=subprocess.STDOUT, + env=env + ) + + # Save PID + self.pid_file.write_text(str(process.pid)) + + log_info("Waiting for Llama Stack to be ready...") + if self._wait_for_health(timeout=30): + log_success(f"Llama Stack running at http://localhost:{self.port}") + log_info(f"Logs: tail -f {self.log_file}") + return True + else: + log_error("Llama Stack failed 
to become healthy") + log_info(f"Check logs: tail -f {self.log_file}") + return False + + except Exception as e: + log_error(f"Failed to start Llama Stack: {e}") + return False + + def stop(self) -> bool: + """Stop Llama Stack""" + if not self.is_running(): + log_warn("Llama Stack not running") + return True + + try: + pid = int(self.pid_file.read_text()) + os.kill(pid, signal.SIGTERM) + + # Wait for graceful shutdown + for _ in range(10): + try: + os.kill(pid, 0) + time.sleep(1) + except ProcessLookupError: + break + + # Force kill if still running + try: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + + self.pid_file.unlink(missing_ok=True) + log_success("Llama Stack stopped") + return True + + except Exception as e: + log_error(f"Failed to stop Llama Stack: {e}") + return False + + def logs(self, tail: int = 20, follow: bool = False) -> None: + """Show logs""" + if not self.log_file.exists(): + log_warn("No log file found") + return + + if follow: + subprocess.run(["tail", "-f", str(self.log_file)]) + else: + subprocess.run(["tail", "-n", str(tail), str(self.log_file)]) + + def _wait_for_health(self, timeout: int = 30) -> bool: + """Wait for service to be healthy""" + import requests + start = time.time() + while time.time() - start < timeout: + try: + response = requests.get(f"http://localhost:{self.port}/health", timeout=2) + if response.status_code == 200: + return True + except Exception: + pass + time.sleep(2) + return False + + +def cmd_setup(args): + """Setup: Check Ollama, pull models, install dependencies""" + log_step("Step 1/3: Checking Ollama") + + ollama = OllamaManager(args.ollama_url) + + if not ollama.is_running(): + log_error("Ollama is not running") + print(ollama.get_installation_instructions()) + sys.exit(1) + + log_success("Ollama is running") + + log_step("Step 2/3: Checking required models") + + for model, description in REQUIRED_MODELS.items(): + if ollama.has_model(model): + log_success(f"{description}: {model}") + else: + log_warn(f"{description}: {model} not found") + if args.non_interactive or input(f"Pull {model}? [Y/n]: ").lower() != 'n': + if not ollama.pull_model(model): + log_error(f"Failed to pull {model}") + sys.exit(1) + + log_step("Step 3/3: Installing dependencies") + + if args.backend == "uv": + local = LocalBackend(port=args.port) + if not local.setup(): + sys.exit(1) + else: + docker = DockerBackend(port=args.port) + if not docker.check_docker(): + log_error("Docker not found. Install Docker or use --backend uv") + sys.exit(1) + log_info("Docker backend ready (no additional setup needed)") + + print() + log_success("Setup complete!") + log_info(f"Start with: uv run llama-stack-helper start --backend {args.backend}") + + +def cmd_start(args): + """Start Llama Stack""" + # Check Ollama first + ollama = OllamaManager(args.ollama_url) + if not ollama.is_running(): + log_error("Ollama is not running. Start it first:") + log_info(" ollama serve") + sys.exit(1) + + # Check models + missing = [m for m in REQUIRED_MODELS if not ollama.has_model(m)] + if missing: + log_error(f"Missing required models: {missing}") + log_info("Run: uv run llama-stack-helper setup") + sys.exit(1) + + # Start backend + if args.backend == "docker": + backend = DockerBackend(port=args.port) + if not backend.check_docker(): + log_error("Docker not available. 
Use --backend uv") + sys.exit(1) + success = backend.start() + else: + backend = LocalBackend(port=args.port) + success = backend.start() + + if success: + print() + log_success("Llama Stack is ready!") + print() + log_info("To use with cc-vec, run:") + print(f' eval "$(uv run llama-stack-helper env)"') + print() + log_info("Then use cc-vec normally with your Athena credentials") + else: + sys.exit(1) + + +def cmd_stop(args): + """Stop Llama Stack""" + if args.backend == "docker": + backend = DockerBackend() + if not backend.check_docker(): + log_error("Docker not available. Cannot stop Docker backend.") + sys.exit(1) + backend.stop() + else: + backend = LocalBackend() + backend.stop() + + +def cmd_status(args): + """Show status""" + log_info("Checking status...") + print() + + # Check Ollama + ollama = OllamaManager(args.ollama_url) + print("Ollama:") + if ollama.is_running(): + log_success(f" Running at {args.ollama_url}") + models = ollama.list_models() + print(f" Models: {len(models)}") + for model, desc in REQUIRED_MODELS.items(): + has = ollama.has_model(model) + status = "✓" if has else "✗" + print(f" {status} {desc}: {model}") + else: + log_error(f" Not running at {args.ollama_url}") + + print() + + # Check Llama Stack + docker = DockerBackend(port=args.port) + local = LocalBackend(port=args.port) + + print("Llama Stack:") + + # Check docker backend if docker is available + docker_available = docker.check_docker() + if docker_available and docker.is_running(): + log_success(f" Running (Docker) at http://localhost:{args.port}") + elif local.is_running(): + log_success(f" Running (UV) at http://localhost:{args.port}") + else: + log_warn(" Not running") + if not docker_available: + print(" (Docker not available on this system)") + + +def cmd_logs(args): + """Show logs""" + docker = DockerBackend() + local = LocalBackend() + + if docker.is_running(): + docker.logs(tail=args.tail, follow=args.follow) + elif local.is_running(): + local.logs(tail=args.tail, follow=args.follow) + else: + log_error("Llama Stack not running") + sys.exit(1) + + +def cmd_env(args): + """Output environment variables for cc-vec""" + docker = DockerBackend(port=args.port) + local = LocalBackend(port=args.port) + + if not docker.is_running() and not local.is_running(): + log_error("Llama Stack is not running. 
Start it first with:") + log_info(" uv run llama-stack-helper start") + sys.exit(1) + + # Output in shell-sourceable format + print(f"export OPENAI_BASE_URL=http://localhost:{args.port}/v1") + print(f"export OPENAI_API_KEY=none") + print(f"export OPENAI_VERIFY_SSL=false") + print(f"export OPENAI_EMBEDDING_MODEL={EMBEDDING_MODEL}") + print(f"export OPENAI_EMBEDDING_DIMENSIONS={EMBEDDING_DIMENSIONS}") + + + + +def main(): + parser = argparse.ArgumentParser( + description="Manage Ollama + Llama Stack for cc-vec", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # First time setup (uses default models) + uv run llama-stack-helper setup + + # Setup with custom models + export LLAMA_STACK_INFERENCE_MODEL=llama3.2:1b + export LLAMA_STACK_EMBEDDING_MODEL=nomic-embed-text + export LLAMA_STACK_EMBEDDING_DIMENSIONS=768 + uv run llama-stack-helper setup + + # Start Llama Stack (Docker) + uv run llama-stack-helper start --backend docker + + # Start Llama Stack (UV/Local) + uv run llama-stack-helper start --backend uv + + # Check status + uv run llama-stack-helper status + + # Set environment variables in your shell + eval "$(uv run llama-stack-helper env)" + + # Now use cc-vec normally with Athena env vars + Llama Stack + uv run cc-vec index --url-patterns "%.edu" --limit 10 + + # View logs + uv run llama-stack-helper logs --follow + + # Stop + uv run llama-stack-helper stop + +Environment Variables: + LLAMA_STACK_INFERENCE_MODEL Inference model (default: llama3.2:3b) + LLAMA_STACK_EMBEDDING_MODEL Embedding model (default: toshk0/nomic-embed-text-v2-moe:Q6_K) + LLAMA_STACK_EMBEDDING_DIMENSIONS Embedding dimensions (default: 768) + """ + ) + + parser.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL, help="Ollama URL") + parser.add_argument("--port", type=int, default=DEFAULT_PORT, help="Llama Stack port") + + subparsers = parser.add_subparsers(dest="command", required=True) + + # Setup command + setup_parser = subparsers.add_parser("setup", help="Setup Ollama and Llama Stack") + setup_parser.add_argument("--backend", choices=["docker", "uv"], default="docker", help="Backend to use") + setup_parser.add_argument("--non-interactive", action="store_true", help="Non-interactive mode") + + # Start command + start_parser = subparsers.add_parser("start", help="Start Llama Stack") + start_parser.add_argument("--backend", choices=["docker", "uv"], default="docker", help="Backend to use") + + # Stop command + stop_parser = subparsers.add_parser("stop", help="Stop Llama Stack") + stop_parser.add_argument("--backend", choices=["docker", "uv"], default="docker", help="Backend to stop") + + # Status command + subparsers.add_parser("status", help="Check status") + + # Logs command + logs_parser = subparsers.add_parser("logs", help="Show logs") + logs_parser.add_argument("--tail", type=int, default=20, help="Number of lines to show") + logs_parser.add_argument("--follow", "-f", action="store_true", help="Follow log output") + + # Env command + subparsers.add_parser("env", help="Output environment variables for cc-vec") + + args = parser.parse_args() + + # Check for requests module + try: + import requests + except ImportError: + log_error("requests module not found. 
Install it with: uv pip install requests") + sys.exit(1) + + # Dispatch commands + if args.command == "setup": + cmd_setup(args) + elif args.command == "start": + cmd_start(args) + elif args.command == "stop": + cmd_stop(args) + elif args.command == "status": + cmd_status(args) + elif args.command == "logs": + cmd_logs(args) + elif args.command == "env": + cmd_env(args) + + +if __name__ == "__main__": + main()
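
For reference, the variables printed by `cmd_env` above are exactly what `load_config()` and `_get_openai_client()` in `src/cc_vec/api.py` consume. Once `eval "$(uv run llama-stack-helper env)"` has been run, the resulting client wiring is roughly equivalent to the following stand-alone sketch; it mirrors the patch's client construction for illustration only and assumes the default embedding model is being served on port 8321:

```python
# Sketch of the client cc-vec ends up with after sourcing the env command:
# an OpenAI client pointed at the local Llama Stack, with SSL verification
# optionally disabled, used for embeddings via the configured model.
import os

import httpx
from openai import OpenAI

verify_ssl = os.getenv("OPENAI_VERIFY_SSL", "true").lower() not in ["false", "0", "no"]

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY", "none"),
    base_url=os.getenv("OPENAI_BASE_URL", "http://localhost:8321/v1"),
    http_client=None if verify_ssl else httpx.Client(verify=False),
)

response = client.embeddings.create(
    model=os.getenv("OPENAI_EMBEDDING_MODEL", "toshk0/nomic-embed-text-v2-moe:Q6_K"),
    input="Common Crawl snippet to embed",
)
print(len(response.data[0].embedding))  # 768 for the default embedding model
```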