From e8dda09fa1cac1071d590f06c36feb9a19454d59 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 11:19:57 -0600 Subject: [PATCH 01/23] upload codebase. --- CMakeLists.txt | 96 +++ README.md | 163 +++- cmake/black_format.cmake | 53 ++ cmake/build_helper.cmake | 173 +++++ cmake/clang_format.cmake | 71 ++ cmake/misc.cmake | 30 + cmake/module/FindClangFormat.cmake | 59 ++ cmake/python_env.cmake | 306 ++++++++ cmake/third_party.cmake | 60 ++ cmake/utils.cmake | 39 + config/README.md | 134 ++++ config/example_elastic.yaml | 65 ++ config/example_qdrant.yaml | 65 ++ config/lance_insert.yaml | 50 ++ config/lance_query.yaml | 60 ++ config/milvus_insert.yaml | 49 ++ config/milvus_query.yaml | 58 ++ config/monitor/example_config.yaml | 48 ++ config/pdfimage/lance_insert_pdfimage.yaml | 49 ++ config/pdfimage/lance_query_pdfimage.yaml | 58 ++ config/pdfimage/milvus_insert_pdfimage.yaml | 49 ++ config/pdfimage/milvus_query_pdfimage.yaml | 58 ++ config/pdftext/lance_insert_pdftext.yaml | 49 ++ config/pdftext/lance_query_pdftext.yaml | 60 ++ doc/figures/ragconfig.png | Bin 0 -> 109898 bytes doc/figures/run.png | Bin 0 -> 38022 bytes example/.gitignore | 14 + example/CMakeLists.txt | 1 + example/monitoring_sys_lib/common.py | 21 + example/monitoring_sys_lib/test_parser.py | 242 ++++++ example/monitoring_sys_lib/test_parser_new.py | 708 ++++++++++++++++++ example/monitoring_sys_lib/test_run.py | 34 + monitoring_sys/.gitignore | 1 + monitoring_sys/CMakeLists.txt | 120 +++ monitoring_sys/README.md | 179 +++++ monitoring_sys/include/cpu_meter.hh | 22 + monitoring_sys/include/disk_meter.hh | 22 + monitoring_sys/include/gpu_meter.hh | 50 ++ monitoring_sys/include/logger.hh | 12 + monitoring_sys/include/mem_meter.hh | 26 + monitoring_sys/include/meter.hh | 122 +++ monitoring_sys/include/meter.ipp | 10 + monitoring_sys/include/msys.hh | 228 ++++++ monitoring_sys/include/proc_meter.hh | 28 + monitoring_sys/include/utils.hh | 259 +++++++ monitoring_sys/include/utils.ipp | 54 ++ monitoring_sys/msys_defs.cmake | 60 ++ monitoring_sys/src/cpu_meter.cc | 168 +++++ monitoring_sys/src/disk_meter.cc | 182 +++++ monitoring_sys/src/gpu_meter.cc | 273 +++++++ monitoring_sys/src/interface.cc | 276 +++++++ monitoring_sys/src/logger.cc | 142 ++++ monitoring_sys/src/mem_meter.cc | 232 ++++++ monitoring_sys/src/meter.cc | 158 ++++ monitoring_sys/src/msys.cc | 665 ++++++++++++++++ monitoring_sys/src/proc_meter.cc | 276 +++++++ monitoring_sys/src/utils.cc | 422 +++++++++++ parser.py | 67 ++ req.txt | 327 ++++++++ resource/.gitignore | 1 + resource/bash_utils.sh | 431 +++++++++++ resource/black_format/.black-format | 17 + .../build_helper/libclang_get_lib_version.py | 56 ++ .../py3_require_executable_module.py | 25 + resource/build_helper/py3_require_package.py | 34 + resource/clang_format/.clang-format | 23 + resource/clang_format/run_clang_format.py | 432 +++++++++++ resource/proto/cpu_metrics.proto | 48 ++ resource/proto/disk_metrics.proto | 45 ++ resource/proto/gpu_metrics.proto | 119 +++ resource/proto/mem_metrics.proto | 163 ++++ resource/proto/proc_metrics.proto | 89 +++ resource/requirements.in | 37 + resource/setup.sh | 11 + script/run_insert.sh | 28 + src/.gitignore | 9 + src/RAGPipeline/BaseRAGPipline.py | 17 + src/RAGPipeline/ImageRAGPipline.py | 143 ++++ src/RAGPipeline/TextsRAGPipline.py | 198 +++++ src/RAGPipeline/__init__.py | 2 + src/RAGPipeline/reranker/BaseReranker.py | 23 + .../reranker/CrossEncoderReranker.py | 58 ++ src/RAGPipeline/reranker/__init__.py | 2 + src/RAGPipeline/responser/BaseResponser.py 
| 21 + src/RAGPipeline/responser/ImagesResponser.py | 71 ++ src/RAGPipeline/responser/TextsResponser.py | 54 ++ src/RAGPipeline/responser/__init__.py | 2 + src/RAGPipeline/retriever/BaseRetriever.py | 108 +++ src/RAGPipeline/retriever/__init__.py | 2 + src/RAGRequest/BaseRAGRequest.py | 29 + src/RAGRequest/TextsRAGRequest.py | 40 + src/RAGRequest/__init__.py | 2 + src/__init__.py | 2 + src/config.py | 156 ++++ src/datasetLoader/BaseDatasetLoader.py | 14 + src/datasetLoader/PDFDatasetLoader.py | 103 +++ src/datasetLoader/TextDatasetLoader.py | 46 ++ src/datasetLoader/__init__.py | 2 + .../BaseDatasetPreprocess.py | 22 + src/datasetPreprocess/PDFDatasetPreprocess.py | 116 +++ .../TextDatasetPreprocess.py | 28 + src/datasetPreprocess/__init__.py | 2 + src/encoder/BaseEncoder.py | 32 + src/encoder/ColPaliEncoder.py | 136 ++++ src/encoder/__init__.py | 2 + src/encoder/sentenceTransformerEncoder.py | 86 +++ src/evaluator/BaseEvaluator.py | 19 + src/evaluator/README.md | 30 + src/evaluator/RagasEvaluator.py | 162 ++++ src/evaluator/RagasOpenAI.py | 108 +++ src/evaluator/Ragasvllm.py | 262 +++++++ src/evaluator/__init__.py | 2 + src/monitoring_sys/__init__.py | 131 ++++ .../config_parser/msys_config_parser.py | 252 +++++++ .../config_parser/resource_identifier/base.py | 24 + .../resource_identifier/this_process.py | 19 + .../resource_identifier/vdb_base.py | 95 +++ .../resource_identifier/vdb_milvus.py | 40 + .../config_parser/resource_identifier/vdbs.py | 0 src/multimodal/PDFPipeline.py | 372 +++++++++ src/multimodal/pdf_parse.py | 114 +++ src/multimodal/structured_parser.py | 13 + src/rag_utils/config.py | 85 +++ src/rag_utils/vec_db.py | 21 + src/run_new.py | 419 +++++++++++ src/ui_client.py | 377 ++++++++++ src/utils/colored_print.py | 103 +++ src/utils/decorator.py | 49 ++ src/utils/env_variable.py | 48 ++ src/utils/logger.py | 380 ++++++++++ src/utils/python_utils.py | 115 +++ src/vectordb/DBInstance.py | 60 ++ src/vectordb/README.md | 136 ++++ src/vectordb/__init__.py | 2 + src/vectordb/chroma_api.py | 320 ++++++++ src/vectordb/elastic_api.py | 304 ++++++++ src/vectordb/lancedb_api.py | 375 ++++++++++ src/vectordb/lancedb_interactive.py | 133 ++++ src/vectordb/milvus_api.py | 412 ++++++++++ src/vectordb/milvus_interactive.py | 265 +++++++ src/vectordb/milvus_util.py | 118 +++ src/vectordb/qdrant_api.py | 314 ++++++++ tests/dataset_test.py | 115 +++ tests/pipeline_test.py | 31 + tests/simple_example.py | 82 ++ 145 files changed, 16295 insertions(+), 2 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 cmake/black_format.cmake create mode 100644 cmake/build_helper.cmake create mode 100644 cmake/clang_format.cmake create mode 100644 cmake/misc.cmake create mode 100644 cmake/module/FindClangFormat.cmake create mode 100644 cmake/python_env.cmake create mode 100644 cmake/third_party.cmake create mode 100644 cmake/utils.cmake create mode 100644 config/README.md create mode 100644 config/example_elastic.yaml create mode 100644 config/example_qdrant.yaml create mode 100644 config/lance_insert.yaml create mode 100644 config/lance_query.yaml create mode 100644 config/milvus_insert.yaml create mode 100644 config/milvus_query.yaml create mode 100644 config/monitor/example_config.yaml create mode 100644 config/pdfimage/lance_insert_pdfimage.yaml create mode 100644 config/pdfimage/lance_query_pdfimage.yaml create mode 100644 config/pdfimage/milvus_insert_pdfimage.yaml create mode 100644 config/pdfimage/milvus_query_pdfimage.yaml create mode 100644 config/pdftext/lance_insert_pdftext.yaml 
create mode 100644 config/pdftext/lance_query_pdftext.yaml create mode 100644 doc/figures/ragconfig.png create mode 100644 doc/figures/run.png create mode 100644 example/.gitignore create mode 100644 example/CMakeLists.txt create mode 100644 example/monitoring_sys_lib/common.py create mode 100644 example/monitoring_sys_lib/test_parser.py create mode 100644 example/monitoring_sys_lib/test_parser_new.py create mode 100644 example/monitoring_sys_lib/test_run.py create mode 100644 monitoring_sys/.gitignore create mode 100644 monitoring_sys/CMakeLists.txt create mode 100644 monitoring_sys/README.md create mode 100644 monitoring_sys/include/cpu_meter.hh create mode 100644 monitoring_sys/include/disk_meter.hh create mode 100644 monitoring_sys/include/gpu_meter.hh create mode 100644 monitoring_sys/include/logger.hh create mode 100644 monitoring_sys/include/mem_meter.hh create mode 100644 monitoring_sys/include/meter.hh create mode 100644 monitoring_sys/include/meter.ipp create mode 100644 monitoring_sys/include/msys.hh create mode 100644 monitoring_sys/include/proc_meter.hh create mode 100644 monitoring_sys/include/utils.hh create mode 100644 monitoring_sys/include/utils.ipp create mode 100644 monitoring_sys/msys_defs.cmake create mode 100644 monitoring_sys/src/cpu_meter.cc create mode 100644 monitoring_sys/src/disk_meter.cc create mode 100644 monitoring_sys/src/gpu_meter.cc create mode 100644 monitoring_sys/src/interface.cc create mode 100644 monitoring_sys/src/logger.cc create mode 100644 monitoring_sys/src/mem_meter.cc create mode 100644 monitoring_sys/src/meter.cc create mode 100644 monitoring_sys/src/msys.cc create mode 100644 monitoring_sys/src/proc_meter.cc create mode 100644 monitoring_sys/src/utils.cc create mode 100644 parser.py create mode 100644 req.txt create mode 100644 resource/.gitignore create mode 100644 resource/bash_utils.sh create mode 100644 resource/black_format/.black-format create mode 100644 resource/build_helper/libclang_get_lib_version.py create mode 100644 resource/build_helper/py3_require_executable_module.py create mode 100644 resource/build_helper/py3_require_package.py create mode 100644 resource/clang_format/.clang-format create mode 100644 resource/clang_format/run_clang_format.py create mode 100644 resource/proto/cpu_metrics.proto create mode 100644 resource/proto/disk_metrics.proto create mode 100644 resource/proto/gpu_metrics.proto create mode 100644 resource/proto/mem_metrics.proto create mode 100644 resource/proto/proc_metrics.proto create mode 100644 resource/requirements.in create mode 100644 resource/setup.sh create mode 100755 script/run_insert.sh create mode 100644 src/.gitignore create mode 100644 src/RAGPipeline/BaseRAGPipline.py create mode 100644 src/RAGPipeline/ImageRAGPipline.py create mode 100644 src/RAGPipeline/TextsRAGPipline.py create mode 100644 src/RAGPipeline/__init__.py create mode 100644 src/RAGPipeline/reranker/BaseReranker.py create mode 100644 src/RAGPipeline/reranker/CrossEncoderReranker.py create mode 100644 src/RAGPipeline/reranker/__init__.py create mode 100644 src/RAGPipeline/responser/BaseResponser.py create mode 100644 src/RAGPipeline/responser/ImagesResponser.py create mode 100644 src/RAGPipeline/responser/TextsResponser.py create mode 100644 src/RAGPipeline/responser/__init__.py create mode 100644 src/RAGPipeline/retriever/BaseRetriever.py create mode 100644 src/RAGPipeline/retriever/__init__.py create mode 100644 src/RAGRequest/BaseRAGRequest.py create mode 100644 src/RAGRequest/TextsRAGRequest.py create mode 100644 
src/RAGRequest/__init__.py create mode 100644 src/__init__.py create mode 100644 src/config.py create mode 100644 src/datasetLoader/BaseDatasetLoader.py create mode 100644 src/datasetLoader/PDFDatasetLoader.py create mode 100644 src/datasetLoader/TextDatasetLoader.py create mode 100644 src/datasetLoader/__init__.py create mode 100644 src/datasetPreprocess/BaseDatasetPreprocess.py create mode 100644 src/datasetPreprocess/PDFDatasetPreprocess.py create mode 100644 src/datasetPreprocess/TextDatasetPreprocess.py create mode 100644 src/datasetPreprocess/__init__.py create mode 100644 src/encoder/BaseEncoder.py create mode 100644 src/encoder/ColPaliEncoder.py create mode 100644 src/encoder/__init__.py create mode 100644 src/encoder/sentenceTransformerEncoder.py create mode 100644 src/evaluator/BaseEvaluator.py create mode 100644 src/evaluator/README.md create mode 100644 src/evaluator/RagasEvaluator.py create mode 100644 src/evaluator/RagasOpenAI.py create mode 100644 src/evaluator/Ragasvllm.py create mode 100644 src/evaluator/__init__.py create mode 100644 src/monitoring_sys/__init__.py create mode 100644 src/monitoring_sys/config_parser/msys_config_parser.py create mode 100644 src/monitoring_sys/config_parser/resource_identifier/base.py create mode 100644 src/monitoring_sys/config_parser/resource_identifier/this_process.py create mode 100644 src/monitoring_sys/config_parser/resource_identifier/vdb_base.py create mode 100644 src/monitoring_sys/config_parser/resource_identifier/vdb_milvus.py create mode 100644 src/monitoring_sys/config_parser/resource_identifier/vdbs.py create mode 100644 src/multimodal/PDFPipeline.py create mode 100644 src/multimodal/pdf_parse.py create mode 100644 src/multimodal/structured_parser.py create mode 100644 src/rag_utils/config.py create mode 100644 src/rag_utils/vec_db.py create mode 100644 src/run_new.py create mode 100644 src/ui_client.py create mode 100644 src/utils/colored_print.py create mode 100644 src/utils/decorator.py create mode 100644 src/utils/env_variable.py create mode 100644 src/utils/logger.py create mode 100644 src/utils/python_utils.py create mode 100644 src/vectordb/DBInstance.py create mode 100644 src/vectordb/README.md create mode 100644 src/vectordb/__init__.py create mode 100644 src/vectordb/chroma_api.py create mode 100644 src/vectordb/elastic_api.py create mode 100644 src/vectordb/lancedb_api.py create mode 100644 src/vectordb/lancedb_interactive.py create mode 100644 src/vectordb/milvus_api.py create mode 100644 src/vectordb/milvus_interactive.py create mode 100644 src/vectordb/milvus_util.py create mode 100644 src/vectordb/qdrant_api.py create mode 100644 tests/dataset_test.py create mode 100644 tests/pipeline_test.py create mode 100644 tests/simple_example.py diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f933f6c --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,96 @@ +cmake_minimum_required(VERSION 3.22) + +project(rag_sys LANGUAGES C CXX) + +# === CMake Options === +option(FORMATTING_ONLY + "Only generate format related targets" OFF) +option(GENERATE_GLOBAL_PY3_DEPENDENCY + "Generate global Python3 package requirements, only asserted when excluding env-specific packages" OFF) +option(GENERATE_ESSENTIAL_PY3_DEPENDENCY + "Generate essential Python3 package requirements (i.e., excluding formatting, QoL, etc.)" OFF) + +# === CMake policy === +# make CMAKE_INTERPROCEDURAL_OPTIMIZATION applies globally +set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) + +# === C/C++ standard === +set(CMAKE_C_STANDARD 17) 
+set(CMAKE_CXX_STANDARD 20)
+
+# === CMake configurations ===
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_VERBOSE_MAKEFILE OFF)
+# build type & optimization
+set(CMAKE_BUILD_TYPE Release)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif()
+
+# === Custom configurations ===
+# Print all target building information formulated using cmake/build_helper.cmake:function(cxx_setup_target)
+set(EXPORT_TARGET_CONFIG OFF)
+
+# === Prioritize executables and libraries in conda environment ===
+# if(DEFINED ENV{CONDA_PREFIX})
+# execute_process(
+# COMMAND [=[echo $SHELL; echo "${CONDA_PREFIX:-'$(dirname $(which conda))/../'}"]=]
+# OUTPUT_VARIABLE CONDA_CMAKE_PREFIX_PATH
+# COMMAND_ECHO STDOUT
+# ECHO_OUTPUT_VARIABLE
+# )
+# list(APPEND CMAKE_PREFIX_PATH "${CONDA_CMAKE_PREFIX_PATH}")
+# endif()
+
+# === Custom repository-wide variables ===
+set(PYTHON_SRC_DIR ${CMAKE_SOURCE_DIR}/src)
+set(RESOURCE_DIR ${CMAKE_SOURCE_DIR}/resource)
+
+# === Create necessary directories ===
+file(MAKE_DIRECTORY ${RESOURCE_DIR}/generated)
+
+# === Include custom module subdirectories ===
+set(CMAKE_MODULE_PATH
+    ${CMAKE_SOURCE_DIR}/cmake/module
+    ${CMAKE_MODULE_PATH})
+
+# === Include utilities ===
+# cmake utilities
+include(cmake/utils.cmake)
+
+# python utilities
+set(PY3_PKG_EXISTENCE_DIR ${CMAKE_BINARY_DIR}/py3_pkg_info)
+set(PY3_PKGDEP_CHK_SCRIPT ${RESOURCE_DIR}/build_helper/py3_require_package.py)
+set(PY3_EXEMOD_CHK_SCRIPT ${RESOURCE_DIR}/build_helper/py3_require_executable_module.py)
+include(cmake/python_env.cmake)
+
+# === Code Formatting configuration ===
+set(CLANG_FORMAT_DIR ${RESOURCE_DIR}/clang_format)
+set(BLACK_FORMAT_DIR ${RESOURCE_DIR}/black_format)
+include(cmake/clang_format.cmake)
+include(cmake/black_format.cmake)
+if(FORMATTING_ONLY)
+  message(STATUS "Only running clang-format, skipping other configurations.")
+  return()
+endif()
+
+# === Include other predefined cmake files ===
+# c++ build helper
+include(cmake/build_helper.cmake)
+
+# misc
+set(LIBCLANG_FIND_VERSION_SCRIPT ${RESOURCE_DIR}/build_helper/libclang_get_lib_version.py)
+include(cmake/misc.cmake)
+
+# third-party library import
+set(THIRD_PARTY_DIR ${CMAKE_SOURCE_DIR}/third_party)
+include(cmake/third_party.cmake)
+
+# === Custom project-wide variables ===
+
+# === Include subdirectories ===
+add_subdirectory(monitoring_sys)
+add_subdirectory(example)
+
+# === Status report ===
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
diff --git a/README.md b/README.md
index 34b89d2..7f5bdc0 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,161 @@
-# RAGPerf
-An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems
+# RASB: RAG-based AI System Benchmarking Framework
+
+**RASB** is an open-source framework designed to benchmark the end-to-end system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers a user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations.
+ + +[![C/C++ Format Check](https://github.com/IOScience/RAGPipeline/actions/workflows/clang-format.yml/badge.svg)](https://github.com/IOScience/RAGPipeline/actions/workflows/clang-format.yml) +[![Python Format Check](https://github.com/IOScience/RAGPipeline/actions/workflows/black-format.yml/badge.svg)](https://github.com/IOScience/RAGPipeline/actions/workflows/black-format.yml) + + +![CMake](https://img.shields.io/badge/CMake-008fba.svg?style=flat&logo=cmake&logoColor=ffffff) +![C++](https://img.shields.io/badge/c++-00599c.svg?style=flat&logo=c%2B%2B&logoColor=ffffff) +![Python](https://img.shields.io/badge/python-3670a0?style=flat&logo=python&logoColor=ffe465) +![OS Linux](https://img.shields.io/badge/OS-Linux-fcc624?style=flat&logo=linux&logoColor=ffffff) +[![Code style: clang-format](https://img.shields.io/badge/C/C++_Code_Style-clang--format-2a3e50?style=flat&logo=llvm&logoColor=cccccc)](resource/clang_format/.clang-format) +[![Code style: black](https://img.shields.io/badge/Python_Code_Style-black-000000?style=flat&logo=black&logoColor=ffffff)](resource/black_format/.black-format) + +## Features + +**🚀 Holistic System-Centric Benchmarking**: RASB moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdown (retrieval vs. generation), and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. + +**🧩 Modular Architecture**: RASB employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from vLLM to OpenAI APIs—without rewriting code. This enables fine-grained analysis of specific component trade-offs. + +**📊 Detailed Full-Stack Profiling**: RASB integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory hierarchy pressure (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between the VectorDB and LLM. + +**🔄 Dynamic Workload Generation**: Simulates the evolution of real-world knowledge bases. The workload generator can interleave standard search queries with insert, update, and delete operations. This allows you to stress-test how a RAG system handles high-concurrency requests while maintaining data freshness. + +**🖼️ Multi-Modal Capabilities**: RASB supports diverse data modalities beyond plain text. It includes specialized pipelines for Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured enterprise data pipelines. 
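+
+For example, switching the retrieval backend is a config-only change: edit the `vector_db` block of your YAML config and leave the rest of the pipeline untouched. A minimal sketch (field meanings are documented in [config/README.md](config/README.md); the path and collection name below are placeholders):
+
+```yaml
+sys:
+  vector_db:
+    type: lancedb                 # swap to 'milvus', 'qdrant', or 'elastic'
+    db_path: /mnt/data/vectordb   # local path (LanceDB) or server URL (others)
+    collection_name: 'test_col'
+```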
+
+---
+
+
+## Table of Contents
+
+- [RASB: RAG-based AI System Benchmarking Framework](#rasb-rag-based-ai-system-benchmarking-framework)
+  - [Features](#features)
+  - [Installation](#installation)
+    - [1) Create a virtual environment](#1-create-a-virtual-environment)
+    - [2) Python dependencies](#2-python-dependencies)
+    - [3) Install monitor system](#3-install-monitor-system)
+  - [Usage](#usage)
+    - [Quick Start with Web UI](#quick-start-with-web-ui)
+      - [1) Preparation](#1-preparation)
+      - [2) Config your Benchmark and run](#2-config-your-benchmark-and-run)
+    - [Run with Command Line (CLI)](#run-with-command-line-cli)
+      - [1) Preparation](#1-preparation-1)
+      - [2) Running the Benchmark](#2-running-the-benchmark)
+      - [3) Output Analysis](#3-output-analysis)
+  - [Supported RAG Pipeline Modules](#supported-rag-pipeline-modules)
+    - [VectorDB](#vectordb)
+    - [Monitoring System](#monitoring-system)
+    - [Customized Modules](#customized-modules)
+
+## Installation
+We highly recommend using an isolated Python environment (Conda).
+
+### 1) Create a virtual environment
+
+**Conda (recommended)**
+```bash
+# Install Miniconda/Mambaforge from the official site if you don't have Conda
+conda create -n ragbench python=3.10
+conda activate ragbench
+```
+
+### 2) Python dependencies
+Run the following commands to install all the dependencies for the project.
+We use `pip-tools` to ensure reproducible dependency resolution.
+
+```bash
+# install pip-compile for python package dependency resolution
+python3 -m pip install pip-tools
+
+# configure MSys and generate a list of all required python packages
+mkdir build && cd build
+cmake ..
+make generate_py3_requirements
+python3 -m pip install -r ../requirement.txt
+```
+
+### 3) Install monitor system
+
+RASB uses a custom, low-overhead monitoring daemon. Please refer to the documentation at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions.
+
+## Usage
+RASB provides an interactive Web UI for ease of use, or you can use the command line (CLI) for automation.
+
+### Quick Start with Web UI
+#### 1) Preparation
+Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell:
+```bash
+# Make local "src" importable
+export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
+
+# Where to cache Hugging Face models (optional, adjust path as needed)
+export HF_HOME="/mnt/data/hf_home"
+```
+Install Streamlit and run the RASB client.
+```bash
+# install streamlit
+python3 -m pip install streamlit
+# run RASB
+streamlit run src/ui_client.py
+```
+Open the reported URL in your web browser; the default is `http://localhost:8501`.
+
+#### 2) Config your Benchmark and run
+To run the benchmark, we first need to set up the retriever, e.g., a vectorDB. See [vectordb](#vectordb). Then, in the webpage, customize your own workload settings. ![config](./doc/figures/ragconfig.png)
+
+Next, on the execute page, click Execute to run the workload. You may also want to review the config file before execution; see [here](./config/README.md) for the config explanation.
![config](./doc/figures/run.png)
+
+### Run with Command Line (CLI)
+#### 1) Preparation
+Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell:
+```bash
+# Make local "src" importable
+export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
+
+# Where to cache Hugging Face models (optional, adjust path as needed)
+export HF_HOME="/mnt/data/hf_home"
+```
+
+#### 2) Running the Benchmark
+To run the benchmark, we first need to set up the retriever, e.g., a vectorDB. See [vectordb](#vectordb). Change `db_path` in the config file to your local vectorDB path.
+```
+vector_db:
+  db_path: /mnt/data/vectordb
+```
+First, run the **preprocess/insert** phase to insert the dataset.
+
+```bash
+# 1) Build/insert into the vector store (LanceDB example)
+python3 src/run_new.py \
+  --config config/lance_insert.yaml \
+  --msys-config config/monitor/example_config.yaml
+```
+To execute the **query/evaluate** phase, run the following:
+```bash
+# 2) Retrieval and query
+python3 src/run_new.py \
+  --config config/lance_query.yaml \
+  --msys-config config/monitor/example_config.yaml
+```
+To customize your own workload settings, you may reference the provided config files within the `./config` folder. The detailed parameters are listed [here](config/README.md).
+
+#### 3) Output Analysis
+You can check the output results within the `./output` folder. To visualize them, run `python3 example/monitoring_sys_lib/test_parser.py`; the generated figures will be placed in `./output`.
+
+## Supported RAG Pipeline Modules
+
+### VectorDB
+
+RASB already integrates with many popular vectorDBs. To set them up, check the detailed documentation at [VectorDB README](src/vectordb/README.md).
+
+Want to add a new DB? Check the RASB API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database) for the standardized operations a new database must implement.
+
+### Monitoring System
+
+Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentation is at [MonitoringSystem README](monitoring_sys/README.md).
+
+### Customized Modules
+
diff --git a/cmake/black_format.cmake b/cmake/black_format.cmake
new file mode 100644
index 0000000..ff1f7e5
--- /dev/null
+++ b/cmake/black_format.cmake
@@ -0,0 +1,53 @@
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
+
+set(BLACK_FORMATTER_VERSION_REQUIREMENT ~=25.0)
+add_py3_pkg_requirements("black${BLACK_FORMATTER_VERSION_REQUIREMENT}" OPTIONAL)
+find_py3_executable_module(black VERSION_REQUIREMENT ${BLACK_FORMATTER_VERSION_REQUIREMENT} VERBOSE)
+
+if(NOT black_FOUND)
+  message(STATUS "[Python Formatting] Matched black not found. Python formatting targets will not be available.")
+
+  add_custom_target(install_black_py3pkg_requirements
+    COMMAND ${Python3_EXECUTABLE} -m pip install "black${BLACK_FORMATTER_VERSION_REQUIREMENT}"
+    COMMENT "Installing black${BLACK_FORMATTER_VERSION_REQUIREMENT}"
+  )
+else()
+  message(STATUS "[Python Formatting] Using black ${black_VERSION}")
+
+  if (NOT Python3_FOUND)
+    message(FATAL_ERROR "[Python Formatting] Python3 interpreter not found")
+  endif()
+
+  find_file(BLACK_FORMAT_FILE .black-format
+    PATHS "${BLACK_FORMAT_DIR}"
+    NO_DEFAULT_PATH)
+
+  # format code in place
+  add_custom_target(python-format
+    COMMAND ${Python3_EXECUTABLE} -m ${black_MODULE}
+      --config "${BLACK_FORMAT_FILE}"
+      --target-version py310
+      --skip-string-normalization
+      --exclude "${CMAKE_SOURCE_DIR}/third_party"
+      "${CMAKE_SOURCE_DIR}"
+    COMMAND echo "Black formatting complete"
+    DEPENDS "${BLACK_FORMAT_FILE}"
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    VERBATIM)
+
+  # check for format violations
+  add_custom_target(python-check-format
+    COMMAND ${Python3_EXECUTABLE} -m ${black_MODULE}
+      --config "${BLACK_FORMAT_FILE}"
+      --check
+      --diff
+      --color
+      --target-version py310
+      --skip-string-normalization
+      --exclude "${CMAKE_SOURCE_DIR}/third_party"
+      "${CMAKE_SOURCE_DIR}"
+    COMMAND echo "Black format check complete"
+    DEPENDS "${BLACK_FORMAT_FILE}"
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    VERBATIM)
+endif()
\ No newline at end of file
diff --git a/cmake/build_helper.cmake b/cmake/build_helper.cmake
new file mode 100644
index 0000000..8a3fad9
--- /dev/null
+++ b/cmake/build_helper.cmake
@@ -0,0 +1,173 @@
+# Manages the C/C++ building process and provides helper functions.
+#
+# Provides:
+# Function `proto_compile` to compile protobuf files into C++/Python sources.
+# Function `cxx_setup_target` to set up a C/C++ target with sources, includes, and depends.
+# Function `cxx_add_executable` to add an executable target.
+# Function `cxx_add_static_library` to add a static library target.
+# Function `cxx_add_dynamic_library` to add a dynamic library target.
+# Function `cxx_add_module` to add a module target.
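+#
+# Example usage (a sketch; the target, source, and output-variable names below
+# are hypothetical and only illustrate the calling convention):
+#
+#   proto_compile(metrics_proto
+#     SOURCE_DIR ${RESOURCE_DIR}/proto
+#     CXX_DEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/proto_gen
+#     GEN_SOURCES METRICS_PROTO_SRCS
+#     SOURCES ${RESOURCE_DIR}/proto/cpu_metrics.proto)
+#
+#   cxx_add_executable(my_tool
+#     SOURCES main.cc ${METRICS_PROTO_SRCS}
+#     INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/proto_gen
+#     DEPENDS protobuf::libprotobuf
+#     TARGET MY_TOOL_TARGET)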
+ +include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) + +message(STATUS "Using C++ Compiler: ${CMAKE_CXX_COMPILER}") + +function(proto_compile name) + cmake_parse_arguments(ARG + "" + "TARGET_NAME;SOURCE_DIR;CXX_DEST_DIR;PY_DEST_DIR;GEN_SOURCES;" + "SOURCES;" + ${ARGN} + ) + + # determine generation language + string(COMPARE NOTEQUAL "${ARG_CXX_DEST_DIR}" "" GEN_CXX) + string(COMPARE NOTEQUAL "${ARG_PY_DEST_DIR}" "" GEN_PY) + if(NOT GEN_CXX AND NOT GEN_PY) + message(FATAL_ERROR "proto_compile did not specify generation directory") + endif() + + # determine all output dirs and protoc generation option + set(PROTOC_OUTPUT_OPTIONS "") + set(PROTOC_OUTPUT_DIRS "") + if(GEN_CXX) + list(APPEND PROTOC_OUTPUT_OPTIONS "--cpp_out=${ARG_CXX_DEST_DIR}") + list(APPEND PROTOC_OUTPUT_DIRS ${ARG_CXX_DEST_DIR}) + endif() + if(GEN_PY) + list(APPEND PROTOC_OUTPUT_OPTIONS "--python_out=${ARG_PY_DEST_DIR}") + list(APPEND PROTOC_OUTPUT_OPTIONS "--pyi_out=${ARG_PY_DEST_DIR}") + list(APPEND PROTOC_OUTPUT_DIRS ${ARG_PY_DEST_DIR}) + endif() + + set(ALL_GENERATED_SOURCES "") + # make the generated sources from respective proto file a group so that they will be generated + # together if proto file updates or one of them is missing for some reason + foreach(PROTO_SOURCE ${ARG_SOURCES}) + set(SOURCE_COMPILED "") + set(HEADER_COMPILED "") + set(GENERATED_SOURCES "") + set(GENERATED_HEADERS "") + get_filename_component(PROTO_SOURCE_NAME ${PROTO_SOURCE} NAME_WLE) + # REVIEW: This method of getting protobuf compiled file is under the assumption of how + # protobuf library today (30.2 as of writing) generates the output file names. This might + # subject to change according to cmake documentation on FindProtobuf. + # (https://cmake.org/cmake/help/latest/module/FindProtobuf.html) + # NOTE: Following the convention that all generated files have the same name as source + # files, with cxx output *.pb.cc and *.pb.h, python output *_pb2.py + # generate c++ sources + if(GEN_CXX) + string(CONCAT SOURCE_COMPILED ${ARG_CXX_DEST_DIR} "/" ${PROTO_SOURCE_NAME} ".pb.cc") + string(CONCAT HEADER_COMPILED ${ARG_CXX_DEST_DIR} "/" ${PROTO_SOURCE_NAME} ".pb.h") + list(APPEND GENERATED_SOURCES ${SOURCE_COMPILED}) + list(APPEND GENERATED_HEADERS ${HEADER_COMPILED}) + endif() + # generate python sources + if(GEN_PY) + string(CONCAT SOURCE_COMPILED ${ARG_PY_DEST_DIR} "/" ${PROTO_SOURCE_NAME} "_pb2.py") + list(APPEND GENERATED_SOURCES ${SOURCE_COMPILED}) + endif() + + add_custom_command( + # for language that generate headers, make sure headers are also in output with source + # so they will also be generated if missing + OUTPUT ${GENERATED_SOURCES} ${GENERATED_HEADERS} + COMMAND ${CMAKE_COMMAND} -E make_directory ${PROTOC_OUTPUT_DIRS} + COMMAND protobuf::protoc -I=${ARG_SOURCE_DIR} ${PROTOC_OUTPUT_OPTIONS} ${PROTO_SOURCE} + DEPENDS ${PROTO_SOURCE} + ) + + list(APPEND ALL_GENERATED_SOURCES ${GENERATED_SOURCES}) + endforeach() + + # return only the actual sources since header dependency will be solved by cmake + set(${ARG_GEN_SOURCES} ${ALL_GENERATED_SOURCES} PARENT_SCOPE) +endfunction() + +function(cxx_setup_target name) + cmake_parse_arguments(ARG "" "NAME;TARGET" "SOURCES;INCLUDES;DEPENDS;COPTIONS;LOPTIONS;" ${ARGN}) + + target_include_directories(${name} PUBLIC ${ARG_INCLUDES}) + target_link_libraries(${name} PUBLIC ${ARG_DEPENDS}) + + list(TRANSFORM ARG_LOPTIONS PREPEND "LINKER:") + target_compile_options(${name} PUBLIC ${ARG_COPTIONS}) + target_link_options(${name} PUBLIC ${ARG_LOPTIONS}) + + + if(EXPORT_TARGET_CONFIG) + # FIXME: pretty print 
indent length hardcoded
+    pad_string(indent_str "" 12 POST)
+    string(CONCAT replace_str "\n" "${indent_str}")
+    # list one source/include/depend per line
+    string(REPLACE ";" ${replace_str} sources "${ARG_SOURCES}")
+    string(REPLACE ";" ${replace_str} includes "${ARG_INCLUDES}")
+    string(REPLACE ";" ${replace_str} depends "${ARG_DEPENDS}")
+    # list all coptions and loptions in one line
+    string(REPLACE ";" " " coptions "${ARG_COPTIONS}")
+    string(REPLACE ";" " " loptions "${ARG_LOPTIONS}")
+
+    string(CONCAT target_config_verbose
+      "Cmake Location: ${CMAKE_CURRENT_SOURCE_DIR}\n"
+      " Compile target: ${name}\n"
+      " Sources: ${sources}\n"
+      " Includes: ${includes}\n"
+      " Depends: ${depends}\n"
+      " Compile Options: ${coptions}\n"
+      " Link Options: ${loptions}\n"
+    )
+    message(STATUS ${target_config_verbose})
+  endif()
+
+  # return target name to caller if ARG_TARGET is specified
+  if(DEFINED ARG_TARGET AND NOT ${ARG_TARGET} STREQUAL "")
+    set(${ARG_TARGET} ${name} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cxx_add_executable name)
+  cmake_parse_arguments(ARG "" "NAME;TARGET" "SOURCES;INCLUDES;DEPENDS;COPTIONS;LOPTIONS;" ${ARGN})
+
+  set(TARGET_NAME ${name})
+  add_executable(${TARGET_NAME} ${ARG_SOURCES})
+  cxx_setup_target(${TARGET_NAME} ${ARGN})
+  if(DEFINED ARG_TARGET AND NOT ${ARG_TARGET} STREQUAL "")
+    set(${ARG_TARGET} ${TARGET_NAME} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cxx_add_static_library name)
+  cmake_parse_arguments(ARG "" "NAME;TARGET" "SOURCES;INCLUDES;DEPENDS;COPTIONS;LOPTIONS;" ${ARGN})
+
+  set(TARGET_NAME ${name}-stc)
+  # STATIC (not MODULE) so the archive can be linked into other targets
+  add_library(${TARGET_NAME} STATIC ${ARG_SOURCES})
+  cxx_setup_target(${TARGET_NAME} ${ARGN})
+  set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME ${name})
+  if(DEFINED ARG_TARGET AND NOT ${ARG_TARGET} STREQUAL "")
+    set(${ARG_TARGET} ${TARGET_NAME} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cxx_add_dynamic_library name)
+  cmake_parse_arguments(ARG "" "NAME;TARGET" "SOURCES;INCLUDES;DEPENDS;COPTIONS;LOPTIONS;" ${ARGN})
+
+  set(TARGET_NAME ${name}-dyn)
+  # SHARED (not MODULE) so the library can be linked against at build time
+  add_library(${TARGET_NAME} SHARED ${ARG_SOURCES})
+  cxx_setup_target(${TARGET_NAME} ${ARGN})
+  set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME ${name})
+  if(DEFINED ARG_TARGET AND NOT ${ARG_TARGET} STREQUAL "")
+    set(${ARG_TARGET} ${TARGET_NAME} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(cxx_add_module name)
+  cmake_parse_arguments(ARG "" "NAME;TARGET" "SOURCES;INCLUDES;DEPENDS;COPTIONS;LOPTIONS;" ${ARGN})
+
+  set(TARGET_NAME ${name}-mod)
+  add_library(${TARGET_NAME} MODULE ${ARG_SOURCES})
+  cxx_setup_target(${TARGET_NAME} ${ARGN})
+  set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME ${name})
+  if(DEFINED ARG_TARGET AND NOT ${ARG_TARGET} STREQUAL "")
+    set(${ARG_TARGET} ${TARGET_NAME} PARENT_SCOPE)
+  endif()
+endfunction()
diff --git a/cmake/clang_format.cmake b/cmake/clang_format.cmake
new file mode 100644
index 0000000..7f93f23
--- /dev/null
+++ b/cmake/clang_format.cmake
@@ -0,0 +1,71 @@
+# Manage the formatting of C/C++ source code using clang-format.
+#
+# Required variables:
+# ::CLANG_FORMAT_DIR::
+# Directory containing the clang-format configuration files and scripts.
+#
+# Provides:
+# Target `cpp-format` to format the code in place.
+# Target `cpp-check-format` to check for formatting violations.
+#
+# Target ::cpp-format::
+# Description:
+# Formats the code in place using clang-format.
+#
+# Target ::cpp-check-format::
+# Description:
+# Checks for formatting violations using clang-format.
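+#
+# Usage (from the build directory):
+#   cmake --build . --target cpp-format        # format sources in place
+#   cmake --build . --target cpp-check-format  # check only, no changes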
+ +find_package(Python3 COMPONENTS Interpreter REQUIRED) +# use clang-format to enforce coding styles +# only clang-format version 14 or later supports --style=file: +find_package(ClangFormat 14) + +if(NOT CLANG_FORMAT_FOUND) + message(STATUS "[C/CPP Formatting] Matched clang-format not found. Cpp formatting targets will not be available.") +else() + message(STATUS "[C/CPP Formatting] Using clang-format ${CLANG_FORMAT_VERSION} (at ${CLANG_FORMAT_EXECUTABLE})") + + if (NOT Python3_FOUND) + message(FATAL_ERROR "[C/CPP Formatting] Python3 interpreter not found") + endif() + + # create formatting helper targets + # using third party clang format python wrapper + find_file(RUN_CLANG_FORMAT run_clang_format.py + PATHS "${CLANG_FORMAT_DIR}" + NO_DEFAULT_PATH) + find_file(CLANG_FORMAT_FILE .clang-format + PATHS "${CLANG_FORMAT_DIR}" + NO_DEFAULT_PATH) + file(GLOB CLANG_FORMAT_IGNORE_FILES "${CMAKE_SOURCE_DIR}/.clang-format-ignore") + + if(NOT RUN_CLANG_FORMAT) + message(FATAL_ERROR "[C/CPP Formatting] run_clang_format.py not found. Check for repo integrity.") + endif() + + # format code in place + add_custom_target(cpp-format + COMMAND "${Python3_EXECUTABLE}" "${RUN_CLANG_FORMAT}" + "${CMAKE_SOURCE_DIR}" + --clang-format-executable "${CLANG_FORMAT_EXECUTABLE}" + --clang-format-style-file "${CLANG_FORMAT_FILE}" + --clang-format-ignore "${CLANG_FORMAT_IGNORE_FILES}" + --recursive + --in-place + COMMAND echo "Clang-format complete" + DEPENDS "${RUN_CLANG_FORMAT}" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}") + + # check for format violations + add_custom_target(cpp-check-format + COMMAND "${Python3_EXECUTABLE}" "${RUN_CLANG_FORMAT}" + "${CMAKE_SOURCE_DIR}" + --clang-format-executable "${CLANG_FORMAT_EXECUTABLE}" + --clang-format-style-file "${CLANG_FORMAT_FILE}" + --clang-format-ignore "${CLANG_FORMAT_IGNORE_FILES}" + --recursive + COMMAND echo "Clang-format check complete" + DEPENDS "${RUN_CLANG_FORMAT}" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}") +endif() diff --git a/cmake/misc.cmake b/cmake/misc.cmake new file mode 100644 index 0000000..a89442a --- /dev/null +++ b/cmake/misc.cmake @@ -0,0 +1,30 @@ +# === helper functions === +function(get_libclang_sharedlib_version OUTPUT_VAR) + assert_valid_path(LIBCLANG_FIND_VERSION_SCRIPT) + execute_process( + COMMAND ${Python3_EXECUTABLE} ${LIBCLANG_FIND_VERSION_SCRIPT} + OUTPUT_VARIABLE LIBCLANG_VERSION + RESULT_VARIABLE LIBCLANG_VERSION_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if(NOT LIBCLANG_VERSION_RESULT EQUAL 0) + message(WARNING "Failed to get libclang version from shared library.") + set(${OUTPUT_VAR} "" PARENT_SCOPE) + return() + endif() + + set(${OUTPUT_VAR} "${LIBCLANG_VERSION}" PARENT_SCOPE) +endfunction() + +# === targets === +function(generate_list_targets_target) + set(LIST_TARGET_TARGET_NAME list_targets) + if(NOT TARGET ${LIST_TARGET_TARGET_NAME}) + add_custom_target(${LIST_TARGET_TARGET_NAME} + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target help + COMMENT "List all available targets" + ) + endif() +endfunction() +cmake_language(DEFER DIRECTORY ${CMAKE_SOURCE_DIR} CALL generate_list_targets_target) diff --git a/cmake/module/FindClangFormat.cmake b/cmake/module/FindClangFormat.cmake new file mode 100644 index 0000000..4551c16 --- /dev/null +++ b/cmake/module/FindClangFormat.cmake @@ -0,0 +1,59 @@ +# FindClangFormat.cmake +# +# Tries to find clang-format of a specific version. 
+#
+# Result Variables:
+# CLANG_FORMAT_EXECUTABLE — the path to the clang-format binary
+# CLANG_FORMAT_FOUND — true if a suitable clang-format was found
+# CLANG_FORMAT_VERSION — the version of clang-format found
+#
+# Cite: https://cmake.org/pipermail/cmake/2014-January/056677.html
+
+# set search paths for clang-format
+string(REPLACE ":" ";" CLANG_FORMAT_SEARCH_PATHS $ENV{PATH})
+
+# reset output variables
+set(CLANG_FORMAT_FOUND OFF)
+unset(CLANG_FORMAT_EXECUTABLE)
+unset(CLANG_FORMAT_VERSION)
+
+# try to find clang-format executable in all search paths
+foreach(CLANG_FORMAT_SEARCH_PATH ${CLANG_FORMAT_SEARCH_PATHS})
+  file(REAL_PATH "${CLANG_FORMAT_SEARCH_PATH}" CLANG_FORMAT_SEARCH_PATH_REAL EXPAND_TILDE)
+  file(GLOB CLANG_FORMAT_EXE_LIST ${CLANG_FORMAT_SEARCH_PATH_REAL}/clang-format*)
+  foreach(CLANG_FORMAT_EXE_CANDIDATE ${CLANG_FORMAT_EXE_LIST})
+    # Extract the version number from the output
+    execute_process(
+      COMMAND ${CLANG_FORMAT_EXE_CANDIDATE} --version
+      OUTPUT_VARIABLE CLANG_FORMAT_CANDIDATE_VERSION_OUTPUT
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_QUIET)
+    string(REGEX MATCH
+      "version ([0-9]+\\.[0-9]+\\.[0-9]+)"
+      _ # discard the full match
+      "${CLANG_FORMAT_CANDIDATE_VERSION_OUTPUT}")
+    set(CLANG_FORMAT_CANDIDATE_VERSION "${CMAKE_MATCH_1}")
+
+    # skip candidates whose version cannot be determined
+    if(NOT CLANG_FORMAT_CANDIDATE_VERSION)
+      continue()
+    endif()
+    # compare with the version requested via find_package(ClangFormat <version>)
+    if(DEFINED ClangFormat_FIND_VERSION AND
+       CLANG_FORMAT_CANDIDATE_VERSION VERSION_LESS ClangFormat_FIND_VERSION)
+      continue()
+    endif()
+
+    # if we reach here, either a version requirement is not set or the candidate version matches
+    set(CLANG_FORMAT_FOUND ON)
+    set(CLANG_FORMAT_EXECUTABLE "${CLANG_FORMAT_EXE_CANDIDATE}")
+    set(CLANG_FORMAT_VERSION "${CLANG_FORMAT_CANDIDATE_VERSION}")
+    break()
+  endforeach()
+  # keep the first suitable candidate found in PATH order
+  if(CLANG_FORMAT_FOUND)
+    break()
+  endif()
+endforeach()
+
+# standard cmake arguments handling
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ClangFormat
+  REQUIRED_VARS CLANG_FORMAT_EXECUTABLE CLANG_FORMAT_VERSION
+  VERSION_VAR CLANG_FORMAT_VERSION
+)
diff --git a/cmake/python_env.cmake b/cmake/python_env.cmake
new file mode 100644
index 0000000..93cf4ff
--- /dev/null
+++ b/cmake/python_env.cmake
@@ -0,0 +1,306 @@
+# Manage Python-related dependencies in the CMake build system.
+#
+# Provides:
+# Function `add_py3_pkg_dependencies` to add Python3 package dependencies to a CMake target.
+# Function `add_py3_pkg_requirements` to add global Python3 package requirements.
+# Function `find_py3_executable_module` to locate a runnable Python3 module.
+# Target `generate_py3_requirements` to generate a Python3 requirements file.
+#
+# Function ::add_py3_pkg_dependencies(TARGET PKG_REQUIREMENTS)::
+# Description:
+# Adds Python3 package dependencies to a CMake target.
+# Arguments:
+# - TARGET: The CMake target that requires the Python3 packages.
+# - PKG_REQUIREMENTS: List of Python3 package requirements in the format that follows the Python PEP 440 standard.
+# Required variables:
+# ::PY3_PKG_EXISTENCE_DIR::
+# Directory to store Python3 package existence check files. Presumably a subdirectory of the
+# build directory.
+# ::PY3_PKGDEP_CHK_SCRIPT::
+# Path to the Python3 script that checks for package requirements.
+#
+# Function ::find_py3_executable_module(MODULE_NAME)::
+# Description:
+# Checks if a Python module is executable (i.e., has a __main__.py file).
+# Arguments:
+# - MODULE_NAME: The name of the Python module to check.
+# Required variables:
+# ::PY3_EXEMOD_CHK_SCRIPT::
+# Path to the Python3 script that checks for executable modules.
+# Returns:
+# - ${MODULE_NAME}_FOUND: True if the module is found, false otherwise.
+# - ${MODULE_NAME}_MODULE: The module name, suitable for running via `python3 -m`.
+# - ${MODULE_NAME}_VERSION: The installed version of the module's package.
+#
+# Function ::add_py3_pkg_requirements(<pkg_req> [OPTIONAL] [ENV_SPECIFIC])::
+# Description:
+# Adds a global Python3 package requirement to the build system.
+# Arguments:
+# - pkg_req: A Python3 package requirement in the format that follows the PEP 440 standard.
+# - OPTIONAL: If set, the requirement is skipped when GENERATE_ESSENTIAL_PY3_DEPENDENCY is on.
+# - ENV_SPECIFIC: If set, the requirement is skipped when GENERATE_GLOBAL_PY3_DEPENDENCY is on.
+#
+# Target ::generate_py3_requirements::
+# Description:
+# Generates a python requirements file by combining all Python3 package requirements.
+#
+# Note:
+# See exactly how version specifiers work at python PEP (PEP 440):
+# https://peps.python.org/pep-0440/#version-specifiers
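+#
+# Example usage (a sketch; the target name and requirement strings are illustrative):
+#
+#   # declare a global requirement for the generated requirements file
+#   add_py3_pkg_requirements("black~=25.0" OPTIONAL)
+#   # locate a runnable module (python3 -m black) matching the requirement
+#   find_py3_executable_module(black VERSION_REQUIREMENT ~=25.0)
+#   # make a target fail early if a required package is missing
+#   add_py3_pkg_dependencies(my_target PKG_REQUIREMENTS "protobuf~=6.30")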
+
+include(${CMAKE_CURRENT_LIST_DIR}/misc.cmake)
+
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
+
+function(add_py3_pkg_dependencies target)
+  cmake_parse_arguments(ARG "" "TARGET" "PKG_REQUIREMENTS;" ${ARGN})
+
+  if(NOT DEFINED PY3_PKGDEP_CHK_SCRIPT OR NOT EXISTS ${PY3_PKGDEP_CHK_SCRIPT})
+    message(FATAL_ERROR "Python3 package dependency check script not found")
+  endif()
+
+  if(NOT DEFINED PY3_PKG_EXISTENCE_DIR)
+    message(FATAL_ERROR "Python3 package existence directory not defined")
+  endif()
+
+  # create the package existence check directory if it does not exist
+  if(NOT EXISTS "${PY3_PKG_EXISTENCE_DIR}")
+    file(MAKE_DIRECTORY ${PY3_PKG_EXISTENCE_DIR})
+  endif()
+  assert_valid_path(PY3_PKG_EXISTENCE_DIR)
+
+  foreach(PKG_REQUIREMENT ${ARG_PKG_REQUIREMENTS})
+    # convert the requirements string into a valid cmake target name
+    string(REGEX MATCH
+      "^([A-Za-z_][A-Za-z0-9_-]*)((~=|==|!=|<=|>=|<|>|===)(.*))?"
+      PKG_REQUIREMENT_MATCH ${PKG_REQUIREMENT})
+    set(PY3PKG_NAME "${CMAKE_MATCH_1}")
+    set(PY3PKG_CONSTRAINT_FULL "${CMAKE_MATCH_2}")
+    set(PY3PKG_CONSTRAINT "${CMAKE_MATCH_3}")
+    set(PY3PKG_VERSION "${CMAKE_MATCH_4}")
+
+    if(${PY3PKG_CONSTRAINT} STREQUAL "~=")
+      set(PY3PKG_CONSTRAINT "CPEQ")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL "==")
+      set(PY3PKG_CONSTRAINT "EXEQ")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL "!=")
+      set(PY3PKG_CONSTRAINT "NTEQ")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL "<=")
+      set(PY3PKG_CONSTRAINT "LTEQ")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL ">=")
+      set(PY3PKG_CONSTRAINT "GTEQ")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL "<")
+      set(PY3PKG_CONSTRAINT "LT")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL ">")
+      set(PY3PKG_CONSTRAINT "GT")
+    elseif(${PY3PKG_CONSTRAINT} STREQUAL "===")
+      set(PY3PKG_CONSTRAINT "ABEQ")
+    else()
+      message(FATAL_ERROR "Unsupported package requirement constraint: ${PKG_REQUIREMENT}")
+    endif()
+
+    string(REPLACE "."
"_" PY3PKG_VERSION "${PY3PKG_VERSION}") + set(PKG_REQUIREMENT_TARGET_NAME "PY3PKG_REQ_${PY3PKG_NAME}_${PY3PKG_CONSTRAINT}_${PY3PKG_VERSION}") + + # create a custom target for each package requirement if not already defined + if(NOT TARGET ${PKG_REQUIREMENT_TARGET_NAME}) + set(REQUIREMENT_FNAME + "${PY3_PKG_EXISTENCE_DIR}/${PKG_REQUIREMENT_TARGET_NAME}.ok") + + add_custom_command( + OUTPUT ${REQUIREMENT_FNAME} + # suppress installed package version output + COMMAND ${Python3_EXECUTABLE} ${PY3_PKGDEP_CHK_SCRIPT} ${PKG_REQUIREMENT} >/dev/null + # if previous command failed (package not found), the touch command will never run + COMMAND ${CMAKE_COMMAND} -E touch ${REQUIREMENT_FNAME} + DEPENDS ${Python3_EXECUTABLE} + VERBATIM + ) + + add_custom_target( + ${PKG_REQUIREMENT_TARGET_NAME} + DEPENDS ${REQUIREMENT_FNAME} + COMMENT "Checking for required Python3 package: ${PKG_REQUIREMENT}" + ) + endif() + + # add the package requirement target as a dependency to the specified target + add_dependencies(${target} ${PKG_REQUIREMENT_TARGET_NAME}) + + message(STATUS "Auto-requiring Python3 package: ${PKG_REQUIREMENT_MATCH} for target ${target}") + endforeach() +endfunction() + +set(EXTRA_PY3_PKG_REQUIREMENTS_VAR EXTRA_PY3_PKG_REQUIREMENTS) +set_property(GLOBAL PROPERTY ${EXTRA_PY3_PKG_REQUIREMENTS_VAR} "") + +function(add_py3_pkg_requirements pkg_req) + cmake_parse_arguments(ARG "OPTIONAL;ENV_SPECIFIC" "PKG_REQUIREMENT" "" ${ARGN}) + + if(GENERATE_GLOBAL_PY3_DEPENDENCY AND ARG_ENV_SPECIFIC) + message(STATUS + "GENERATE_GLOBAL_PY3_DEPENDENCY set, global Python3 package requirement ${pkg_req} is not added") + elseif(GENERATE_ESSENTIAL_PY3_DEPENDENCY AND ARG_OPTIONAL) + message(STATUS + "GENERATE_ESSENTIAL_PY3_DEPENDENCY set, global Python3 package requirement: ${pkg_req} is not added") + else() + set_property(GLOBAL APPEND PROPERTY ${EXTRA_PY3_PKG_REQUIREMENTS_VAR} "${pkg_req}") + message(STATUS "Adding global Python3 package requirement: ${pkg_req}") + endif() +endfunction() + +function(generate_py3_requirements) + cmake_parse_arguments(ARG "" "INPUT_FILE;OUTPUT_FILE;" "" ${ARGN}) + + set(GEN_PY3_PKGREQ_TARGET generate_py3_requirements) + find_program(PIP_COMPILE_EXECUTABLE pip-compile) + if(NOT PIP_COMPILE_EXECUTABLE) + message(STATUS + "pip-compile not found, target ${GEN_PY3_PKGREQ_TARGET} will not be available. 
" + "To install pip-compile, run `python3 -m pip install pip-tools`") + return() + endif() + + # create generated directory at input directory if it does not already exist + get_filename_component(INPUT_DIR ${ARG_INPUT_FILE} DIRECTORY) + cmake_path(APPEND GENERATED_DIR ${INPUT_DIR} "generated") + if(NOT EXISTS ${GENERATED_DIR}) + file(MAKE_DIRECTORY ${GENERATED_DIR}) + endif() + + # write extra package requirements to a generated requirements file + set(EXTRA_REQUIREMENTS_FILE "${GENERATED_DIR}/extra.in") + set(NEW_EXTRA_REQUIREMENTS_FILE "${GENERATED_DIR}/extra.new.in") + get_property(EXTRA_PY3_PKG_REQUIREMENTS GLOBAL PROPERTY ${EXTRA_PY3_PKG_REQUIREMENTS_VAR}) + set(EXTRA_PY3_PKG_REQUIREMENTS_CONTENTS "# === GENERATED BY CMAKE START ===\n") + if(EXTRA_PY3_PKG_REQUIREMENTS) + foreach(EXTRA_PY3_PKG_REQUIREMENT ${EXTRA_PY3_PKG_REQUIREMENTS}) + string(APPEND EXTRA_PY3_PKG_REQUIREMENTS_CONTENTS "${EXTRA_PY3_PKG_REQUIREMENT}\n") + endforeach() + endif() + string(APPEND EXTRA_PY3_PKG_REQUIREMENTS_CONTENTS "# === GENERATED BY CMAKE END ===\n") + file(WRITE ${NEW_EXTRA_REQUIREMENTS_FILE} "${EXTRA_PY3_PKG_REQUIREMENTS_CONTENTS}") + + # prevent cmake generate the target if cmake is run again but no changes are made + if(EXISTS ${EXTRA_REQUIREMENTS_FILE}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${NEW_EXTRA_REQUIREMENTS_FILE} ${EXTRA_REQUIREMENTS_FILE} + ) + else() + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy + ${NEW_EXTRA_REQUIREMENTS_FILE} ${EXTRA_REQUIREMENTS_FILE} + ) + endif() + + # merging all requirements, use relative path to avoid absolute path shown in generated file + set(COMBINED_REQUIREMENTS_FILE "${GENERATED_DIR}/combined.in") + cmake_path(RELATIVE_PATH COMBINED_REQUIREMENTS_FILE + BASE_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE COMBINED_REQUIREMENTS_FILE_REL) + cmake_path(RELATIVE_PATH ARG_OUTPUT_FILE + BASE_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE OUTPUT_FILE_REL) + + add_custom_command( + OUTPUT ${ARG_OUTPUT_FILE} + COMMAND ${CMAKE_COMMAND} -E copy ${EXTRA_REQUIREMENTS_FILE} ${COMBINED_REQUIREMENTS_FILE_REL} + COMMAND ${CMAKE_COMMAND} -E cat ${ARG_INPUT_FILE} >> ${COMBINED_REQUIREMENTS_FILE_REL} + COMMAND ${PIP_COMPILE_EXECUTABLE} ${COMBINED_REQUIREMENTS_FILE_REL} + --output-file ${OUTPUT_FILE_REL} + --strip-extras >/dev/null 2>&1 # silence output + DEPENDS ${EXTRA_REQUIREMENTS_FILE} ${ARG_INPUT_FILE} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMENT "Generating Python3 requirements file via pip-compile, this may take a while" + ) + + add_custom_target(${GEN_PY3_PKGREQ_TARGET} + DEPENDS ${ARG_OUTPUT_FILE} + COMMENT "Generated Python3 requirements file to ${ARG_OUTPUT_FILE}" + ) + + message(STATUS "Python3 requirements file generation destination: ${ARG_OUTPUT_FILE}") +endfunction() + +# include standard cmake arguments handling +include(FindPackageHandleStandardArgs) +# similar to find_package +function(find_py3_executable_module module_name) + cmake_parse_arguments(ARG "REQUIRED;VERBOSE" "MODULE_NAME;VERSION_REQUIREMENT" "" ${ARGN}) + + set(${module_name}_FOUND OFF) + unset(${module_name}_MODULE) + + assert_valid_path(PY3_EXEMOD_CHK_SCRIPT) + execute_process( + COMMAND ${Python3_EXECUTABLE} ${PY3_EXEMOD_CHK_SCRIPT} ${module_name} + RESULT_VARIABLE PYTHON_EXEMOD_FOUND) + + # executable module found + if(PYTHON_EXEMOD_FOUND EQUAL 0) + # get exemod version + execute_process( + COMMAND ${Python3_EXECUTABLE} ${PY3_PKGDEP_CHK_SCRIPT} ${module_name} + OUTPUT_VARIABLE ${module_name}_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + 
set(${module_name}_FOUND ON) + + if(ARG_VERSION_REQUIREMENT) + # check if the module version matches the requirement + execute_process( + COMMAND ${Python3_EXECUTABLE} ${PY3_PKGDEP_CHK_SCRIPT} "${module_name}${ARG_VERSION_REQUIREMENT}" + OUTPUT_QUIET + RESULT_VARIABLE MODULE_VERSION_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT MODULE_VERSION_RESULT EQUAL 0) + set(${module_name}_FOUND OFF) + endif() + endif() + + if(${module_name}_FOUND) + set(${module_name}_MODULE "${module_name}") + endif() + endif() + + find_package_handle_standard_args(${module_name} + REQUIRED_VARS ${module_name}_FOUND ${module_name}_MODULE ${module_name}_VERSION + VERSION_VAR ${module_name}_VERSION + ) + + if(${module_name}_FOUND) + message(STATUS "Found Python3 executable module: ${${module_name}_MODULE} (version ${${module_name}_VERSION})") + set(${module_name}_FOUND ${${module_name}_FOUND} PARENT_SCOPE) + set(${module_name}_MODULE ${${module_name}_MODULE} PARENT_SCOPE) + set(${module_name}_VERSION ${${module_name}_VERSION} PARENT_SCOPE) + else() + set(SEVERITY WARNING) + if (ARG_REQUIRED) + set(SEVERITY FATAL_ERROR) + elseif (ARG_VERBOSE) + set(SEVERITY STATUS) + endif() + message(${SEVERITY} "Python3 executable module: ${module_name} not found.") + endif() +endfunction() + +# call the function after all other configurations are done +cmake_language(DEFER DIRECTORY ${CMAKE_SOURCE_DIR} CALL generate_py3_requirements + INPUT_FILE ${RESOURCE_DIR}/requirements.in + OUTPUT_FILE ${CMAKE_SOURCE_DIR}/requirement.txt +) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake new file mode 100644 index 0000000..99078b5 --- /dev/null +++ b/cmake/third_party.cmake @@ -0,0 +1,60 @@ +# Manage third-party library imports in the CMake build system. +# +# Required variables: +# ::THIRD_PARTY_DIR:: +# Directory containing third-party libraries. Presumably a subdirectory of the project root. 
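+#
+# To vendor an additional library, follow the existing pattern (a sketch with a
+# hypothetical submodule name):
+#
+#   function(add_foo)
+#     set(FOO_FOLDER ${THIRD_PARTY_DIR}/foo)
+#     # set import options here, before add_subdirectory
+#     add_subdirectory(${FOO_FOLDER} third_party/foo)
+#   endfunction()
+#   add_foo()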
+ +# assumes variable THIRD_PARTY_DIR is already set to the path of the third-party directory +assert_valid_path(THIRD_PARTY_DIR) + +# === define submodule import rules === +# pybind11 support +function(add_pybind11) + set(PYBIND11_FOLDER ${THIRD_PARTY_DIR}/pybind11) + # === import options === + # === import === + add_subdirectory(${PYBIND11_FOLDER} third_party/pybind11) + # === src, include, depends, coptions, and loptions === + set(PYBIND11_INCLUDES ${PYBIND11_FOLDER}/include PARENT_SCOPE) +endfunction() + +# protobuf support +function(add_proto) + set(PROTO_FOLDER ${THIRD_PARTY_DIR}/protobuf) + # === import options === + # build abseil as static, do not dynamic link + set(Protobuf_USE_STATIC_LIBS ON) + set(protobuf_BUILD_SHARED_LIBS ON) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(BUILD_SHARED_LIBS ON) + # === import === + add_subdirectory(${PROTO_FOLDER} third_party/protobuf) + # === src, include, depends, coptions, and loptions === +endfunction() + +# pre-c++20 time zone info support +function(add_date) + set(DATELIB_FOLDER ${THIRD_PARTY_DIR}/date) + # === import options === + set(USE_SYSTEM_TZ_DB ON) + set(BUILD_TZ_LIB ON) + set(ENABLE_DATE_INSTALL OFF) + # === import === + add_subdirectory(${DATELIB_FOLDER} third_party/date) + # === src, include, depends, coptions, and loptions === +endfunction() + +# === actually import the submodules === +add_pybind11() + +# NOTE: Refer to protobuf version naming here: https://protobuf.dev/support/version-support/ +find_package(Protobuf 6 CONFIG QUIET) +if(${Protobuf_FOUND}) + message(STATUS "Using system protobuf v${Protobuf_VERSION} (at ${Protobuf_DIR})") +else() + # use third-party proto if there is no existing installation + message(STATUS "Using protobuf module in third_party/") + add_proto() +endif() + +add_date() diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 0000000..0942fe4 --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,39 @@ +# Helper functions & targets + +# Pad a string to a specified length with spaces +# ::OUTPUT_VAR:: Name of the variable to store the padded string +# ::STR:: The string to pad +# ::LEN:: The target length of the string +# ::LOCATION:: Where to add padding: PRE (before), POST (after) +function(pad_string OUTPUT_VAR STR LEN LOCATION) + string(LENGTH "${STR}" strlen) + + if(strlen LESS ${LEN}) + math(EXPR padding_length "${LEN} - ${strlen}") + string(REPEAT " " ${padding_length} padding) + if(${LOCATION} STREQUAL PRE) + set(STR "${padding}${STR}") + elseif(${LOCATION} STREQUAL POST) + set(STR "${STR}${padding}") + else() + message(FATAL_ERROR "Invalid pad_string LOCATION") + endif() + endif() + + set(${OUTPUT_VAR} "${STR}" PARENT_SCOPE) +endfunction() + +# Asserts that a given path variable is defined and exists +# ::VAR_NAME:: Name of the variable to check +function(assert_valid_path VAR_NAME) + if(NOT DEFINED ${VAR_NAME}) + message(FATAL_ERROR "Path variable '${VAR_NAME}' is not set.") + endif() + + # Use indirect expansion to get the value of the variable + set(VAR_VALUE "${${VAR_NAME}}") + + if(NOT EXISTS "${VAR_VALUE}") + message(FATAL_ERROR "Path '${VAR_VALUE}' (from variable '${VAR_NAME}') does not exist.") + endif() +endfunction() diff --git a/config/README.md b/config/README.md new file mode 100644 index 0000000..7fafcd0 --- /dev/null +++ b/config/README.md @@ -0,0 +1,134 @@ +# RASB Configuration Guide + +This document details the configuration parameters used in the RAG (Retrieval-Augmented Generation) benchmarking pipeline. 
The configuration file is in YAML format and controls data processing, model selection, hardware allocation, and pipeline execution flow. + +## 1. Top-Level Metadata + +| Parameter | Description | +| :--- | :--- | +| **`run_name`** | A unique identifier for the current experiment (e.g., `default_run`). This is used for naming log files and output directories. | + +--- + +## 2. Benchmark Data Settings (`bench`) + +This section defines the dataset source and how the text/image is pre-processed before ingestion. + +```yaml +bench: + dataset: wikimedia/wikipedia # HuggingFace dataset path or local identifier + type: text # Data modality ('text', 'image') + preprocessing: + chunk_size: 512 # Max tokens/chars per chunk + chunk_overlap: 0 # Overlap between chunks + chunktype: length # Strategy (e.g., 'length') + dataset_ratio: 0.001 # Percentage of dataset to use (0.001 = 0.1%) +``` + +--- + +## 3. RAG Pipeline Configuration (`rag`) + +These settings control which stages of the pipeline run and the specific parameters for each component. + +### 3.1 Pipeline Actions (`action`) +Boolean flags to enable or disable specific pipeline stages. This allows running only insertion, only retrieval, or a full end-to-end test. + +```yaml +rag: + action: + preprocess: true # Enable data chunking/loading + embedding: true # Enable vector embedding generation + insert: true # Enable insertion into VectorDB + build_index: true # Enable index creation (IVF, HNSW, etc.) + retrieval: false # Enable vector search + reranking: false # Enable cross-encoder reranking + generation: false # Enable LLM response generation + evaluate: false # Enable RAGAS evaluation +``` + +### 3.2 Embedding (`embedding`) +Configuration for the model that converts text/images into vectors. + +| Parameter | Description | +| :--- | :--- | +| `device` | GPU device identifier (e.g., `cuda:0`). | +| `sentence_transformers_name` | Name of the model (e.g., `all-MiniLM-L6-v2`, `vidore/colpali-v1.2`). | +| `batch_size` | Number of items processed per batch during embedding. | +| `embedding_framework` | Backend framework (e.g., `sentence_transformers`). | + +### 3.3 Vector Database Operations (`insert`, `build_index`) +Parameters for writing data and creating efficient search structures. + +```yaml +rag: + insert: + batch_size: 512 # Number of vectors inserted per transaction + collection_name: '' # Optional override for collection name + drop_previous_collection: false + build_index: + index_type: IVF_HNSW_SQ # Type of index (IVF_PQ, HNSW, FLAT, etc.) + metric_type: L2 # Distance metric (L2, IP, COSINE) +``` + +### 3.4 Retrieval & Reranking (`retrieval`, `reranking`) +Controls the search phase. + +```yaml +rag: + retrieval: + question_num: 16 # Number of queries to run + retrieval_batch_size: 4 # Batch size for querying VectorDB + top_k: 10 # Number of results to fetch per query + reranking: + device: cuda:0 + rerank_model: Qwen/Qwen2.5-7B-Instruct # Model used for reranking + top_n: 5 # Number of results to keep after reranking +``` + +### 3.5 Generation (`generation`) +Settings for the Large Language Model (LLM) that generates the final answer. + +| Parameter | Description | +| :--- | :--- | +| `device` | GPU device identifier. | +| `model` | Path or name of the LLM (e.g., `Qwen/Qwen2.5-7B-Instruct`). | + +### 3.6 Evaluation (`evaluate`) +Settings for automated quality assessment (e.g., using RAGAS). + +| Parameter | Description | +| :--- | :--- | +| `evaluator_model` | Model used as the judge for metrics like faithfulness. | + +--- + +## 4. 
+
+---
+
+## 4. System Configuration (`sys`)
+
+Configures backend infrastructure, hardware allocation, and logging.
+
+### 4.1 Vector Database (`vector_db`)
+Connection details for the vector store backend.
+
+```yaml
+sys:
+  vector_db:
+    type: lancedb               # Backend type: 'lancedb', 'milvus', 'qdrant', 'elastic'
+    db_path: /path/to/db        # File path (LanceDB) or URL (Milvus/Qdrant/Elastic)
+    collection_name: 'test_col' # Name of the collection/table
+    drop_previous_collection: false
+```
+
+### 4.2 Devices (`devices`)
+
+| Parameter | Description |
+| :--- | :--- |
+| `cpu` | CPU identifier. |
+| `gpu_count` | Number of GPUs available to the system. |
+| `gpus` | List of specific GPU IDs (e.g., `["cuda:0", "cuda:1"]`). |
+
+### 4.3 Logging (`log`)
+
+| Parameter | Description |
+| :--- | :--- |
+| `metrics_log` | Path for the main execution log file. |
\ No newline at end of file
diff --git a/config/example_elastic.yaml b/config/example_elastic.yaml
new file mode 100644
index 0000000..ef156fe
--- /dev/null
+++ b/config/example_elastic.yaml
@@ -0,0 +1,65 @@
+bench:
+  dataset: wikimedia/wikipedia
+  preprocessing:
+    chunk_overlap: 0
+    chunk_size: 512
+    chunktype: length
+    dataset_ratio: 0.0001
+rag:
+  action:
+    build_index: true
+    embedding: true
+    evaluate: false
+    generation: false
+    insert: true
+    preprocess: true
+    reranking: false
+    retrieval: true
+  build_index:
+    index_type: IVF_HNSW_SQ
+    metric_type: L2
+  embedding:
+    sentence_transformers_name: all-MiniLM-L6-v2
+    batch_size: 1024
+    embedding_framework: sentence_transformers
+    model: nomic-ai/nomic-embed-text-v2-moe
+    store: false
+    load: false
+  evaluate:
+    evaluator_embedding: ragdata/bge-large-zh-v1.5
+    evaluator_model: ragdata/Qwen2-7B-Instruct-GPTQ-Int8
+  generation:
+    device: cuda:0
+    model: Qwen/Qwen2.5-7B-Instruct
+  insert:
+    batch_size: 512
+    collection_name: ''
+    drop_previous_collection: false
+  reranking:
+    device: cuda:0
+    rerank_model: Qwen/Qwen2.5-7B-Instruct
+    top_n: 5
+  retrieval:
+    question_num: 16
+    retrieval_batch_size: 2
+    top_k: 5
+run_name: default_run
+sys:
+  devices:
+    cpu: cpu
+    gpu_count: 2
+    gpus:
+    - cuda:0
+    - cuda:1
+  log:
+    metrics_log: ./log/default_run.log
+  vector_db:
+    # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_1'
+    # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_0_1_512' #IVF_FLAT
+    # collection_name: 'wikimedia_wikipedia_All_mpnet_base_v2_0_1_512' #DISKANN
+    # collection_name: 'wikimedia_wikipedia_Alibaba_NLP_gte_large_en_v1_5_0_1_512' #GPU_IVF
+    collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_1' #GPU_IVF
+    db_path: http://localhost:9200
+    db_token:
+    drop_previous_collection: false
+    type: elastic
diff --git a/config/example_qdrant.yaml b/config/example_qdrant.yaml
new file mode 100644
index 0000000..8e0b00c
--- /dev/null
+++ b/config/example_qdrant.yaml
@@ -0,0 +1,65 @@
+bench:
+  dataset: wikimedia/wikipedia
+  preprocessing:
+    chunk_overlap: 0
+    chunk_size: 512
+    chunktype: length
+    dataset_ratio: 0.0001
+rag:
+  action:
+    build_index: true
+    embedding: true
+    evaluate: false
+    generation: false
+    insert: true
+    preprocess: true
+    reranking: false
+    retrieval: true
+  build_index:
+    index_type: IVF_HNSW_SQ
+    metric_type: L2
+  embedding:
+    sentence_transformers_name: all-MiniLM-L6-v2
+    batch_size: 1024
+    embedding_framework: sentence_transformers
+    model: nomic-ai/nomic-embed-text-v2-moe
+    store: false
+    load: false
+  evaluate:
+    evaluator_embedding: ragdata/bge-large-zh-v1.5
+    evaluator_model: ragdata/Qwen2-7B-Instruct-GPTQ-Int8
+  generation:
+    device: cuda:0
+    model: Qwen/Qwen2.5-7B-Instruct
+  
insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false + reranking: + device: cuda:0 + rerank_model: Qwen/Qwen2.5-7B-Instruct + top_n: 5 + retrieval: + question_num: 16 + retrieval_batch_size: 2 + top_k: 5 +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_1' + # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_0_1_512' #IVF_FLAT + # collection_name: 'wikimedia_wikipedia_All_mpnet_base_v2_0_1_512' #DISKANN + # collection_name: 'wikimedia_wikipedia_Alibaba_NLP_gte_large_en_v1_5_0_1_512' #GPU_IVF + collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_1' #GPU_IVF + db_path: http://localhost:6333 + db_token: + drop_previous_collection: false + type: qdrant diff --git a/config/lance_insert.yaml b/config/lance_insert.yaml new file mode 100644 index 0000000..683eff0 --- /dev/null +++ b/config/lance_insert.yaml @@ -0,0 +1,50 @@ +bench: + dataset: wikimedia/wikipedia + type: text + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.1 +rag: + action: + preprocess: true + embedding: false + insert: true + build_index: true + reranking: false + retrieval: false + generation: false + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: all-MiniLM-L6-v2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: true + filepath: /home/shaobol2/RAGPipeline/wiki_entire.pickle + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_text_test' + db_path: /mnt/data1/shaobol2/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb \ No newline at end of file diff --git a/config/lance_query.yaml b/config/lance_query.yaml new file mode 100644 index 0000000..10fda4f --- /dev/null +++ b/config/lance_query.yaml @@ -0,0 +1,60 @@ +bench: + dataset: wikimedia/wikipedia + type: text + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.001 +rag: + action: + preprocess: false + embedding: false + insert: false + build_index: false + reranking: false + retrieval: true + generation: true + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + evaluate: + evaluator_model: Qwen/Qwen2-7B-Instruct-GPTQ-Int8 + embedding: + device: cuda:0 + sentence_transformers_name: all-MiniLM-L6-v2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + generation: + device: cuda:0 + model: Qwen/Qwen2.5-7B-Instruct + reranking: + device: cuda:0 + rerank_model: Qwen/Qwen2.5-7B-Instruct + top_n: 5 + retrieval: + question_num: 512 + retrieval_batch_size: 64 + top_k: 10 + pipeline: + batch_size: 64 +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_text_test' + db_path: /mnt/data1/shaobol2/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb diff --git a/config/milvus_insert.yaml b/config/milvus_insert.yaml new 
file mode 100644
index 0000000..2e5f029
--- /dev/null
+++ b/config/milvus_insert.yaml
@@ -0,0 +1,49 @@
+bench:
+  dataset: wikimedia/wikipedia
+  type: text
+  preprocessing:
+    chunk_overlap: 0
+    chunk_size: 512
+    chunktype: length
+    dataset_ratio: 0.001
+rag:
+  action:
+    preprocess: true
+    embedding: true
+    insert: true
+    build_index: true
+    reranking: false
+    retrieval: false
+    generation: false
+    evaluate: false
+  build_index:
+    index_type: IVF_HNSW_SQ
+    metric_type: L2
+  embedding:
+    device: cuda:0
+    sentence_transformers_name: all-MiniLM-L6-v2
+    batch_size: 1024
+    embedding_framework: sentence_transformers
+    model: nomic-ai/nomic-embed-text-v2-moe
+    store: false
+    load: false
+  insert:
+    batch_size: 512
+    collection_name: ''
+    drop_previous_collection: false
+run_name: default_run
+sys:
+  devices:
+    cpu: cpu
+    gpu_count: 2
+    gpus:
+    - cuda:0
+    - cuda:1
+  log:
+    metrics_log: ./log/default_run.log
+  vector_db:
+    collection_name: 'milvus_test'
+    db_path: http://localhost:19530
+    db_token: root:Milvus
+    drop_previous_collection: false
+    type: milvus
diff --git a/config/milvus_query.yaml b/config/milvus_query.yaml
new file mode 100644
index 0000000..4b62e80
--- /dev/null
+++ b/config/milvus_query.yaml
@@ -0,0 +1,58 @@
+bench:
+  dataset: wikimedia/wikipedia
+  type: text
+  preprocessing:
+    chunk_overlap: 0
+    chunk_size: 512
+    chunktype: length
+    dataset_ratio: 0.001
+rag:
+  action:
+    preprocess: false
+    embedding: false
+    insert: false
+    build_index: false
+    reranking: true
+    retrieval: true
+    generation: true
+    evaluate: false
+  build_index:
+    index_type: IVF_HNSW_SQ
+    metric_type: L2
+  embedding:
+    device: cuda:0
+    sentence_transformers_name: all-MiniLM-L6-v2
+    batch_size: 1024
+    embedding_framework: sentence_transformers
+    model: nomic-ai/nomic-embed-text-v2-moe
+    store: false
+    load: false
+  generation:
+    device: cuda:0
+    model: Qwen/Qwen2.5-7B-Instruct
+  reranking:
+    device: cuda:0
+    rerank_model: Qwen/Qwen2.5-7B-Instruct
+    top_n: 5
+  retrieval:
+    question_num: 16
+    retrieval_batch_size: 4
+    top_k: 10
+  pipeline:
+    batch_size: 4
+run_name: default_run
+sys:
+  devices:
+    cpu: cpu
+    gpu_count: 2
+    gpus:
+    - cuda:0
+    - cuda:1
+  log:
+    metrics_log: ./log/default_run.log
+  vector_db:
+    collection_name: 'milvus_test'
+    db_path: http://localhost:19530
+    db_token: root:Milvus
+    drop_previous_collection: false
+    type: milvus
\ No newline at end of file
diff --git a/config/monitor/example_config.yaml b/config/monitor/example_config.yaml
new file mode 100644
index 0000000..710c472
--- /dev/null
+++ b/config/monitor/example_config.yaml
@@ -0,0 +1,48 @@
+MSys:
+  system:
+    output_dir: ${{ pylogger.log_dirpath }}
+    default_sample_period_ms: 100
+  meter:
+  - type: CPUMeter
+    # name: CPU Monitoring
+  - type: DiskMeter
+    # name: Disk Monitoring
+    devices:
+      ${{ - this_process.used_disks }}
+      ${{ - vdbs.used_disks }}
+  - type: MemMeter
+    # name: Memory Monitoring
+    probes:
+    - ${{ mem_mon.probe.MEM_BASIC }}
+    - ${{ mem_mon.probe.MEM_KERNEL_CACHE }}
+    - ${{ mem_mon.probe.MEM_ACTIVE_INACTIVE }}
+    - ${{ mem_mon.probe.MEM_SWAP }}
+    - ${{ mem_mon.probe.MEM_DIRTY_WRITEBACK }}
+    - ${{ mem_mon.probe.MEM_TYPE }}
+    - ${{ mem_mon.probe.MEM_DIRECT_MAP }}
+  - type: GPUMeter
+    # name: GPU Monitoring
+    gpu_ids:
+      ${{ - gpus.all_gpus }}
+    nvml_metrics: []
+    gpm_metrics:
+    - ${{ gpu_mon.probe.GPM_SM_UTIL }}
+    - ${{ gpu_mon.probe.GPM_SM_OCCUPANCY }}
+    - ${{ gpu_mon.probe.GPM_PCIE_TX_PER_SEC }}
+    - ${{ gpu_mon.probe.GPM_PCIE_RX_PER_SEC }}
+    - ${{ gpu_mon.probe.GPM_DRAM_BW_UTIL }}
+    - ${{
gpu_mon.probe.GPM_INTEGER_UTIL }} + - ${{ gpu_mon.probe.GPM_FP16_UTIL }} + - ${{ gpu_mon.probe.GPM_FP32_UTIL }} + - ${{ gpu_mon.probe.GPM_FP64_UTIL }} + - ${{ gpu_mon.probe.GPM_ANY_TENSOR_UTIL }} + - ${{ gpu_mon.probe.GPM_GRAPHICS_UTIL }} + - type: ProcMeter + # name: Process Monitoring + pids: + ${{ - this_process.pids }} + ${{ - vdbs.pids }} + probes: + - ${{ proc_mon.probe.STAT }} + - ${{ proc_mon.probe.STATM }} + - ${{ proc_mon.probe.IO }} diff --git a/config/pdfimage/lance_insert_pdfimage.yaml b/config/pdfimage/lance_insert_pdfimage.yaml new file mode 100644 index 0000000..f20640f --- /dev/null +++ b/config/pdfimage/lance_insert_pdfimage.yaml @@ -0,0 +1,49 @@ +bench: + dataset: common-pile/arxiv_papers + type: image + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: true + embedding: true + insert: true + build_index: true + reranking: false + retrieval: false + generation: false + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: vidore/colpali-v1.2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_image_test' + db_path: /mnt/data1/yuanxu4/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb diff --git a/config/pdfimage/lance_query_pdfimage.yaml b/config/pdfimage/lance_query_pdfimage.yaml new file mode 100644 index 0000000..2f64dd1 --- /dev/null +++ b/config/pdfimage/lance_query_pdfimage.yaml @@ -0,0 +1,58 @@ +bench: + dataset: common-pile/arxiv_papers + type: image + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: false + embedding: false + insert: false + build_index: false + reranking: false + retrieval: true + generation: true + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: vidore/colpali-v1.2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false + generation: + device: cuda:0 + model: Qwen/Qwen2-VL-7B-Instruct + retrieval: + question_num: 4 + retrieval_batch_size: 1 + top_k: 2 + pipeline: + batch_size: 1 +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_image_test' + db_path: /mnt/data1/yuanxu4/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb diff --git a/config/pdfimage/milvus_insert_pdfimage.yaml b/config/pdfimage/milvus_insert_pdfimage.yaml new file mode 100644 index 0000000..1865b5b --- /dev/null +++ b/config/pdfimage/milvus_insert_pdfimage.yaml @@ -0,0 +1,49 @@ +bench: + dataset: common-pile/arxiv_papers + type: image + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: true + embedding: true + insert: true + build_index: true + reranking: false + retrieval: 
false + generation: false + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: vidore/colpali-v1.2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'image_text_test' + db_path: http://localhost:19530 + db_token: root:Milvus + drop_previous_collection: false + type: milvus diff --git a/config/pdfimage/milvus_query_pdfimage.yaml b/config/pdfimage/milvus_query_pdfimage.yaml new file mode 100644 index 0000000..9def8f7 --- /dev/null +++ b/config/pdfimage/milvus_query_pdfimage.yaml @@ -0,0 +1,58 @@ +bench: + dataset: common-pile/arxiv_papers + type: image + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: false + embedding: false + insert: false + build_index: false + reranking: false + retrieval: true + generation: true + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: vidore/colpali-v1.2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false + generation: + device: cuda:0 + model: Qwen/Qwen2-VL-7B-Instruct + retrieval: + question_num: 2 + retrieval_batch_size: 2 + top_k: 2 + pipeline: + batch_size: 2 +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'image_text_test' + db_path: http://localhost:19530 + db_token: root:Milvus + drop_previous_collection: false + type: milvus diff --git a/config/pdftext/lance_insert_pdftext.yaml b/config/pdftext/lance_insert_pdftext.yaml new file mode 100644 index 0000000..5dcb77c --- /dev/null +++ b/config/pdftext/lance_insert_pdftext.yaml @@ -0,0 +1,49 @@ +bench: + dataset: common-pile/arxiv_papers + type: text + preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: true + embedding: true + insert: true + build_index: true + reranking: false + retrieval: false + generation: false + evaluate: false + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + embedding: + device: cuda:0 + sentence_transformers_name: all-MiniLM-L6-v2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + insert: + batch_size: 512 + collection_name: '' + drop_previous_collection: false +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_insert_pdftext_test' + db_path: /mnt/data1/yuanxu4/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb diff --git a/config/pdftext/lance_query_pdftext.yaml b/config/pdftext/lance_query_pdftext.yaml new file mode 100644 index 0000000..670c2f4 --- /dev/null +++ b/config/pdftext/lance_query_pdftext.yaml @@ -0,0 +1,60 @@ +bench: + dataset: common-pile/arxiv_papers + type: text 
+ preprocessing: + chunk_overlap: 0 + chunk_size: 512 + chunktype: length + dataset_ratio: 0.0001 +rag: + action: + preprocess: false + embedding: false + insert: false + build_index: false + reranking: true + retrieval: true + generation: true + evaluate: true + build_index: + index_type: IVF_HNSW_SQ + metric_type: L2 + evaluate: + evaluator_model: Qwen/Qwen2-7B-Instruct-GPTQ-Int8 + embedding: + device: cuda:0 + sentence_transformers_name: all-MiniLM-L6-v2 + batch_size: 1024 + embedding_framework: sentence_transformers + model: nomic-ai/nomic-embed-text-v2-moe + store: false + load: false + generation: + device: cuda:0 + model: Qwen/Qwen2.5-7B-Instruct + reranking: + device: cuda:0 + rerank_model: Qwen/Qwen2.5-7B-Instruct + top_n: 5 + retrieval: + question_num: 16 + retrieval_batch_size: 4 + top_k: 10 + pipeline: + batch_size: 4 +run_name: default_run +sys: + devices: + cpu: cpu + gpu_count: 2 + gpus: + - cuda:0 + - cuda:1 + log: + metrics_log: ./log/default_run.log + vector_db: + collection_name: 'lance_insert_pdftext_test' + db_path: /mnt/data1/yuanxu4/lancedb # local path for lance db + db_token: + drop_previous_collection: false + type: lancedb diff --git a/doc/figures/ragconfig.png b/doc/figures/ragconfig.png new file mode 100644 index 0000000000000000000000000000000000000000..6db5d09ce85a53799ee4e25d57efe4769cba6740 GIT binary patch literal 109898 zcmd>lRa9I}w=D$s;0{T!;O@{Q!JPzmcXzj70U8Jpw2=@965L%HcMtCF4h=MN`#bmi z_v1X?w>!qI7Y4hlYS*s4YOb~BnmbZONg4~C3>^**4og->;tL!c@&p_lA|om?>`FcS zyCK*gMAuKUYN)X0hidi>_CKkcq?VhiqlKHNv5PsJrGulrIjgIwi@CXjtCgeMDMC8{ z)`|6>PU0@+#%|V*4sX<~?akq&T&&-42)>asc6h_Z!NvE6lS7C@K!}t7%}2#I;;L#T zOL{YKaBtvbB|fToWgIR0x~W~aK0ogc({eWs(*~2G%A(4Cm!Z|8eVy|oKT36f4x7^o?5-(aFd6?D93EnNgm%x8H!@ktp^cUm5$?&63ycjPb(e(Kn z{mU69{~uU_qDo-a(kW!~^6m`!_$UYjaFWH4U?O88iwELJzFa+9bpXahivc??FRz-q z`o#1!&(9E^+zA;82}EaSXG~Hq#N5wFelH(y)dD$CEKd`V9hWN|=kjpt><3s)0z67` zrzv1k12XdRXpyY{M}L#Y1jI}c@2WBQ^s(LCqN4q(F#F8R%m%$O!1d|1kD$j9No{Ry ziaE>6QCu8X^4~P{^_O>5jH`xX^ta+iLs`>_%r^B3dj&Wvw?lou9)lUtdQ=LE(1WQ^^?Bfjx#Z zFaOl|G;r*crrXZKNsZRn*tom9`!y96_6JyRK=x^|9Pdl3zq>hvydF8cuJVU^0HyH7 z?kDUrlo-_2)qkj|xvp7}9y{RNHZPrb#!{$Ht^(ZGQ4PzH={A% z?wJvWk7Bm~sJD5Qdh_(6so%4*!sbkjKYuJ|>T8My0=J;6&j4ILa_Km#p5y(WH3x`& zf0X?we`ZcDE%aJKan|kZOiKF1ot!ulC#N{G$O^t?fSU7D#kbj(Rvd(f`4YY)I;TR- zwO&{RaS3c$LF-+cdq+oG9pDf@=Ne$uf|N*QCgSYkW$RP-xP_NubA zBrlkrCg<&2y_AGcD&s*7iti9IA_n6$Tvw}`ojq60WzElvnfobz6TASoGVI56Y{`>c{zNWJLccg4qbW>G?-d-Wh z4^H1BrKun2svR5Cn(mRae9znG&gXYSwT2tgLDr6r4Ezw&bP%ZVu*};qb8{?kMaIpI zhkxmNY;j{4&301D)xqJ9Wa{0gY|D#i-xG_ge%dVENA&9$Ma!7-k>i{NZD~D!Q9yYL zj@ZNZ)4*pa4B#F>`d(cQ8Z2I+C6jz_&EJl`>duaIg#a8_sWQFd&ZmPwnxN*)<9-%N@1e!t`)@! 
zdB2!?o*oTG?^q(eFo!lizAmaI)kXxr@fM~Q)M|lzr5ZyHBTblqO+s9im2R#XNCb#= zcAhHeqpw<`hn3p$_CoUnt$n>P5S$?0mh9q)WeO)-lJ?8&^R1igEoHswj+VRD*7%?E z={gpGiD8pRWi+J83CPJI=bfnMb$1$1i}eRZ{!B`PxB{LKIPtf9F7JtY1UDQ_PA%fBL#@!}t+>94S{eEfp;pDpC3Dhn+3 zOisdko}T{lt;>9n@2}om&_&_LCd^Hdrvj!1v|A?yHOU3SH1e(#-ijl4ve5nAZeSI&XaJ zY29}^1-QRpVT3|QhdB1wOxF0oDk2?SB+%vI^OtqsEg9+2V-uRFTPU+Zf&|Pqe=N>P zX;ACuelWB>f4V0pHx@u~`j~-FA2}5o|24hhXDx6n0kayrrY`TV|7~tsR8&+^(^+Hm ztU`b~i_uYx(dAw{M=M?~e(OSx{k&>^HufyBv%m%N-M6s{*`+<=aU|GM^#9oD1a@LL z1X5etxR0Vgu%H(6ar}|*nXNNl-U$^7W#SJwvDJ&GRa+hP*s3-3}@(8yW?XL(Fg?#VKx1mpiBAo90 zdb+O%2XE>M2*?N?hfIOtN7K6)zZsAt9;|`nt0$%*87TfTn#d@i>Fc}MvGYt%Hk z@Um($u136s*ab=*CayzFw z!d>qiG>KV}mTgNwXS$d)HwgpUc#CZ}l#HqKvo?cuyN?oE%Wd~p|17-xRjyq99+i~+ zc`+smBi68%L=>tH@G1m#m9)fzJ6_ocK2j%eFcqbX)xKtQ4W?w^wuY$o5X8vU`;%}iSAmwFkVufBfsCAeRrvwq#b3Mb`*$jblK0)JQw@Jv6c0{v7$@BooLV64 z-$Qd=k+1NHU=D9e7apk&r)Wxj7VxJcte&Nq3Y+Qp4|eOZCV5=wZ?F?MCJuEb{KQ{AfR3RhxWBc&Qq!7U>T=T<~6B@Pq<=Dd7rDrH|2 zvwCEs1X?E!%MT9b2J+}D zuG!9dXfF(ozK3sLzR>I5Qx-6r^L?|(;T;X0m6UCEa1|&+KNt^I4e2lAfBnn&AxzGMnqymiS`kvt5592m6iG?99bLl) zGLm~f-ie(oL#(~(^&uTcB#thEG>fK|+vSc)*WkV*3xLMwmhkczxhOIIRm2z1yT=MO zSSDImML<1-9|py~i_gE*INS}GzdiF)b#~I8K>}a0+m^%fK{uAK^s3%8s3@P5@r`~* zAYWEk8#A9yf8-xTA_rfp29cTtc!Wa1E%A!&j@+fKS| zn(tIV^u3EUEGoll8+`_p`RUHvV|6|o>q#Nct39HxqXtM{#roUL>9q{HSW)KlVf#kZ z+%Q{|h1rrG48u#rzy5~#OY=mM06aszwXw+k3g{aiC(gBc%#iZ|V(0UQc;U^Aa!`$O zf9#7Vjd$VZ-7=2cry7Te2`|(IUK&aVyCkcxj+SFt>F{%$OTRNgahXc@6)k~(SFOt- z+VxyxLfqE?m3$zs?Iz=N-3`RJyaqs@oBahD9|G;SS{WWZV z8~BR+4`2RvVf=r^ZZ8#-9KSTuyiIItBV{(3dVN%7wgYUYH=AFQaKCK^yIhfL6A_Gt zkUh%%!kdI4>w$3zSb!7}t;lT8@#qm;tnTi@j5&OIEj+AFb zN*8ng%*CLF*cmfiW@aX`@?$vLgPeX2%;s3LUEN(Bn|bsZ{M9N?Ppu_x=0#wQfmGwz z>^Y)0WTd4>weH%$n+fo*3C}g3zqq~rF@0w-k1};ld$4fiU=M}&ej#X8krRWA*GvwY z=bozuWm+U#&Q+d0(Yn~JbgVpCZ}CzKBH$WAvr3<>Wr}yZTNq9ifM0O0$7(0U7VD3q zeFAnCC7-W8YGanvabSl7%NH}eSU_t90f{z_@L0o>LG+w!m3()~d-rxOk!JBrY_fLdUqlvP(tW@Pkf^B8Mqs88(eGA*nV|^ z_mOE(rQ1EcG2VX3>=p8X?|0^G(deP)FYI-@gDi&r?6?YN+awwd#C;Dx|NAs zS-`33&Fk$f?HSGZi$3t zuNXe8KU{Ws=%~ZCRr;lf( z&!_|g{NV0>!HvidMA|f>IE;F60IAbPCQM7*SR0RWck}23PAMGvDA>5{E6}Q*nwc-b z7f0d)B>JPOREnb$8>62A-MeKaZ*dhB6`OjO+NU)Qo?7k|(jJ2>hGFcstgIQ!)qz`S zzD9*psj3*|C?Q4VLAP-ul6n}$b=(9iPQ3c(kG7Ra9yW*@5)>m1rJ}j=-hwWjon)@y zQR~}_;&!;}JxxD|b3efFj!6@)K!LhDhyXCngNcs*)%}*w-mM(Z^BF>u&hVkfyLb3< zdlv^N2pAg}FeM-*WBogkt9+P%%XOgKA)@+ZG%COtR?N=HSjeMM4sf0Ny|Ig3h{4V{V4*QTUks6<9%Cbg2rkV(%NeA1sc+)a`_nxdi#Ro>Yh@%&Lse7L`-^KYotBPl40JQ}wtVKzNms?%+> zblr9U3`%AXC@z*HmH+UlUmjn@XC>3n2F)FgH|lJIhyY%lt9`xU_Re>^(La6dP4*vy zoG-?i5|5z@B-b!6(GU>uH-uXkO8@)Lohr`Q|1w-@NcwSpXDIlX#Nsmjif*Q6j=SzAn8}5P}C!^m)3X z%-^`j!k z@4V8%gVaP}R_9cJ&zU|M^ofHOL~i~$F;?85E~>I3+opT+)=B;~hDAsS9P=QZ17`|e zA)RUKbcU33IMp)5TtYmKOLLO+Hzf%~TPxE6At^y%wH*K@UiAtvm-KyNL#Z+ypuzn@ zjYxmRS;NJO4sYQ||MXh$dI$268}Ty4VKt9VXbzA5WKQ4PYq)t(^OoV=U1LDUFgOIi z`=U_wo`~!;WgRTy$6>IRwZzw*=k?a?s1(MT!>}%2CDIPL3noAphRXW{qgdaT;&Iw_ zoT-6zmYKA;i(lt-SJD#x>=viE7S7#rX6S`lW2i2q$6Q>u|Eze*8LY1WK>M8!evY$r zj?3cnIhYPy{q$Jhbb*jj-;{*M1KX?hO77lHPDKCXXvX`KI-pSlR0LJJP)@~A`LB3d zWY_%Ot}b*8h!ANLoIUrN2W|{&-hOV525MK#wVjWxhEO0_(dZZ<8I`mJC8f_TX(yg(xnd; z?bb_BFjCdN-7?OYpBJgnJG8?;ULr$KStloG;_IUX$z812b$s~brS=%``Pa6_{34^M zkj724sGB45s(spyV`QD>n3La)^LXtv@At-oze2Rplp)J#xLqv>!tch%p=vpLgo~|x znM7e%lPHKZ?msUNzXqVW<_)jhUSu>h|6;*G_;_|Gv3`S!#+75ui?1BO%~t(0CcC{y zRQcO(a~*@z5<1D8XKZt#=?|Pe1JCl132?O1tp1x7im7t_vnRh!h|h0N#PFB3HhyGg z9MZJD+!ol+Fu+-wRFT(m+pI7ix}}eak%+X3a%gLzaQzV8y(qFIeCBkN{|sG~i_vo6 zLlW-9TWU|z4?E=e@(v8eq*EZ#jEOM(p<`Ri`Q7Nndd2+%muynE(~qo_7)rv>BM z&C8_JV0j-6`CY`31hR}l$GB>IQpnmsD_9ZPMwje#9;ZIw`i6$^U;*CbLey!=*Ut)f 
z+~`NzJ%$?U{IBoN_KL{?gNb~Qm}5A2g`R%J8#4TENAW;%HxofbwIb#>E9%*0nV{Id zk<@XgT9U^G+UZk5t*O=OSXVYvM;EFSQP46Remy{YGi>oE_YG@*bo1c4A>boCL$4}N zv{O6}_#y$Me7m*t8kN;D#_niCu8boz+Mqa#-I3Yi7J^K^1fN@O5Yn*c>LPhEwL#N= z)FxXDqk>I%t=fSD36NU=Z}18yP-C^)qS#Nf>!ceL#b6@mz)dUPjoHv)BWl0QVTkCV z!tH|{%=1CqCDtcCpS6^Vr=<_)gl2%jKcUROdB7v;r6{jc&>2g-&z{r4-Qb3(0rv2K zY6nrbUG{o0Q_M-PmrXwu_EYfLj|*&2W1HWR{0l>^uxcAr{jEcAa-wy%cfw?H2--2W zMMJf}BB7z^rKZ6#_)azOqE!)c-1uBu<%rm5l*+ouTH?vCa)r{e55IfQfLoIEg5OQm z`)nkKv;u)>4L3_98nt+1?|@{D#>w<&+k|D-7G$mi`3=xR8uyzs{;0vE!M>Zxt<)|i zXSKBj%^xeHiBfxfV-8VTPlFP~b-C)_?3#F@m0WhOaO@_bzaEgwu@{#eeJ zz(jRkm?WsSU<;LZylk`%0R2vSWGPWTH8cmxfs`{ z$gQaY?fDeC=a}oV+A1fISOh}C7b~FPA~nL{ofL+pbE^J6vzLb8!#?4(8s%wO0esk_m)z5SHhYy;ky_%shDQ%ejQUlT$a* ze6T#HArCIvtvMkSda1QpOtO*B&o9*6+X3a$hP#!NY^6!~c4`ohy=22swO^Uxz9&9# zZs5ec@C+_l+FJxza&8*B?pFHCH28Z1W+~9?x@saAcg)uAGK}UAmCuj&x`2UT{)M@JJgUZ&(G}z zzyyU)u23ecYb}0mxZsZ@0OkI^0q$s z6KZ&+9o_t86hp)KtU=u|&fv94JXG}X?be}iaZ+{J${qnaYKaUucroTZDw?*shu*M8 z_g1HKB{W)HfagLxj$Led+gT&NCaxlpY@{z(1ND!A_N0qqxbRmZ{y*@hy;Qm>$waZ@8UQKG@9De{kG|52vTFSolwda6IP=rPtio zbJ__vYYi~SJN<=5^-H;Zy;t_ds#|IKyhEd*=hRbd2u|pz}{hp8fjmRZR zuL20aEC&W$>x0Q(1Udg+@ouB4FTr>D8izStW{N=mVk+eOK-@%VK_l6^UI!C+9I5~G z#6Dr~v`ssqcvxG@hONbR+49E$^w~PsJX4ARG?3F_;0RS_+jQ8|+-#U$Wos#Eq{vL< z>Xg_glRU@`N?B!}s+b~taHa!OgL`nD(97fbRw50@g7L%zlb1Hl$al%=j>zSf2(v*F zuU|K4ed5W%{E02JaTbBoRg1e=@Pb{`G9oTIK^CU@PUy+Qq-APplJ9QDA#32n%7NKJ zC9^Jg>ZiDe3I&*G!j(Z>66@iWPG~!t24ST(sPGi8`SCX)AlM|0n`Q)|AV7`)lGoXg z7|)frU9uiA5(Bftxf=@CE}+)6esSG>5HURx|5}|Ej}?6CBpENgnwh^n{Kdqr`_DpcC z(sZOt)vPsBFZ*ax&oM6aIC3+*jyRNd>Z3l-#6}Dx`^Ch_b)Z^_We)Lv3wNWY@IpnX-2P3E(fXo>Esp# zyVo`Rc0)#%=;K(R)&doY5lc{uacvz3b?;Z*-Tb{zFMjKSLnyG>%R(ZXH9s#^MryFZ zdZ=LoTgu5dVm`~C%u&ItfTdre&rj9F=P2?m-wbpRQ=3oQ+?6`lsqy$~njLl5`VI$9 za4s!)IZ&hWBXoXhs3v-`{7DP@*V98Z`V+?LseRSo2ZadTUMmU-*ZS|Uk{|Z_E z)WqUgZuyCVd!6??3hcrHM@Pryl@+hb$R~$O;L1<4cNZw9MU3?SrqQgWvGo_02rc^G zmf0(CDLBLQk}!k&6rqK~v;1ndt^il-Acc8ozH^EGNegiJ+sTmgzC88I2o2O&> z{k=W3sUO(FBaezn@{2ALabE*@I~ET3F2isJu0Mu1prIf#Dx56s^298;VOv!r%!(SE zP3`imZyde1YsC*Mbk>fHWj2W+px>z6n3-TA=beZ-hcZ(WN;zNy*d0-p6qHFRVRHy-_0 zPFzgGnjU*7WoIUkYbEY&J56*#WzpE1TUZD9>#ns46-Qj}W{XMfBvkY1zMM0|?0Nz+ zu$m$*8&q2guSv#ev-9L>x0r!-h34Y)gA+iVwt@eMIhel_+WP*X;gIiC7*?v=N4i15 zc-aH=m3&Ec3kSRarLm}?-)h7lI$0=2vO?M(^2|244-pi^_NO+{-5I01fkk3 zyK(;dOwGPPK%;t$kEEHqu+>Z9ij78;V{%9K>~5Q~;k)`!*>ICq7g_*&0^!dED4b5B z;!K?W3GwQVV(bR=E%kl&H1J!BIF@$TbMF;CFj&!P97HO^8{2BJ#Cr1TpnWEmG+;*= z!`umEuzJ*5r$x!+aq~WFMI4;c`sGCdV46?QmQbh-A614|rL#=2>m&*`eh5t5IfrH6 ztnH;n$bW~_U8dE`U_MiA+&niv{7!bBI{JD*xn*J{zhyE0m=E-B_FyJ6+NNQuw)v@0 zwjX4z&#xHrmcDyHv)^QVN}-u)Wtd)jYH>O{sSFm;$C) z<+{$3>}lVXV!3;0DnsKQ-Jrs)fB68$ysaI1<+8JZAhHHiRwtLQ6`pg8w%zPrKm4;6 zx6o9vUIP*=`p@DYe>K`}zN&SVozX$nClC4E9&w?60OTHdYu9*@AQvqDMvT-4pmxEu zBH=H{f|l`D9@ew@|84CNy;var$76W>{67G(Q~!5_jAdhI_avpEAuW{p@d+VT2{%jQ za=Cv~i5ahq^-Gs`$10ko9uZ(QB-r|%I9TaRb)m$62w~mVpdK<3^460*crX15B>8~oQg-Tc`S$g;@~?>v|?J0yB3mUG7(UjWn|!0#Szn9+2Y#@<8|{M*+UajQRD48!2OcL7yLwP)e;4 zb9v2=K&0wAyg%xFWGre8(GhZYaL81{BH!}5U!kxLImgs@WrUba-PtR44v;zx5Wg6n z`_UNHE@-k=o4>Vqo^x`RPf?UAWblMK8KQt?BdS39Zytfdhw&{x3WN5PdN4SX#8=n5skT7P`Xz^7+oLU~GJysNT)^0tMp6UC zDX>nX$2~M*JzhJx*aCY#MaAg6yNL;2qX)pkdM)=hv1@X&vKPT{B&46?aA}c!i}kHJ z?bCo*&ZwrOEdaWC;f>MsIe;`;&ty+his88<%Y8mLp{?vSc1TG~>D`J~w4!&krB@x$ zz&p*8Yx}YhaG=7h?e);BfOT^wheS5~Sw;A1K4k05!;puP!w88~jZ=*~e-|_fI|@}H znBLjohrmZSYwqgX;Uh~P=ks08nK<0`a8E9mw^K!>NVT^MoZWBkRCE?*JKSqL%j_Hd z4+;D&_PTOXeMK9Fo21d4c=C3ma>G6WjeHyK^xK}Ef7F}AtGf07oJwY9F>_>Zgs|b znRNPr#`rl}(r;LnIAl}q$(~HuSB=o2AZHhZ=BrB5163?WbNwfv*C!v&%J0a| zQl6=1za(#9kFuW+k2!Patj$dN$pc0|K_fZgnO>KB(K-7;LbWw(a2CDtC9Ls}Yu&F4 
zoH0|_6vof3bVy#6$*ZsQutt$Jb@x@6;a7ptFyDR+MawhzELbli zqE`094u>NGa<28cI&ja!b(b}5YmkBt`;soUW4(~aA9+YeZx{2D@CnfWZ3CE}#w}mv zsP7WCQ}IG~N*6<`nDC>W+pmp4p*NPMc)e$sVb}5~g1)HVrAx))7>?Bz*#niMkMT6X z4cI;3)pQY`q;H+I=4ij`BGYj>rTkGvUaD2-?V>+uU|hp@c!eGY+XdHu0AFLbFL*xk zwgcr=YyVU9Y$h|ccMDjrc5(~mPGD_Yk!(GC|W#M-^ZtapG~KTwT-NKw&8lr|IrVi(evYc*CjF*-RveIG3up;+BIEZ7rab`Hf#9slrvSZ z-U&UUM}6crxYPAop9TP|4(Vl*< zV8m*&&l6Jt7U2Dg?SHTgpsX8M!_>Y)D^8E;B4kj#*Y}MkC7W5uLm8!cWG4$=n#R7IxDmG%z6er(Ln6 z_UY>W8FVIcvvl)@7Ir>*XC8(H<89DW1#<4znoROfOhCQK)x0gGy27c(qau0fD}Ify zN+QzJr8HNDyt1U<_4v6(l>d}WC;YW3us=PLYTkZ^nUWGI=e@76;EguS=G5M#)HX8_ ztsQBqZ`E_nKxPr`tFU_}jC5)#X8clmN;{g_L!#ytL(2M}U`k%`a`@z;UXtV_QarhV z`OP^13I~I56NDgBosBu|I>4vI+;rzRBSXn0e%d&Mrj6@~{XrJ*zxOPjZ!2Gndr+C6 zDB_Kfgv{iw1-!a!tv2~UdIedYTH(!-2ufAulKes9XD#W+Xbpu8(h>|G-Xp)%qbdi>y>IYd(Cs-{(B3EzH0JjXR==s|LL!#b5*j}KiRy5$q zS8Zn(nF{*kaB@qUU;<$6sDT{Kr_|f$=Y-!DJ`?pdApj+kY7(tOTxL)T8&AwkI$`u% zO>#Xc_Ec;xZQ!oMYNGX&bt-^-Of2(18HiyBF(J{%O&cTzGKx# zTqC_XF$Q|ayl*@ls1b`%9H6zL)z!~y}NE7j12BtSw@I)oyf&=oC~f@B867AF@`?%sFRf&&-~^e|yi~<}5^M^$$6%HPUR6U$&0kQ+zV} z$;@dpoTRfTMR&HnpZrDyc7JIXOK=+rMG!`wnZ->Xb$aQCvt-q%F)pmrCNXYLT(b1D z;@++{w)rkKJMD60R``d-*)BqT$*nOu0Ort;X}L{{&&(=$!XBKwAH)l1d=Y&k$BJJY`7E!zx6FY)>etqk3+`0g%x$$*q zF-{X(lgfBk5u0_CUrJmEU!1rdv_MH|Mv9al!DJp<6RSr@-FLBi?oaoUua@ccG<%CK z@#(y{T8Zm#uCpkY?>#iKbdWHrZlp-8K5yo_VBt4(;o5Z(-Ia3Ztu4mqpDRO!8Oc}X z>2>Kl>5t^87QeT}cR4Jht~N>2w|_i(`};v`#n|O-kri^%2?vSMl;+W8ig!!bb7_zC zuFdXVuYq_qls9s1CwbRd6S=AeY%1uRE_e0xnt$V!FthN3zWdo;*X1G>(=6jk7gdem zbMy8ZXB%_HjXC`nwQpHnCE1zieUHg4VsHMcUYb^YbCtBJzS-^?F@Cv-VNstdT9}%N zOJ$T@C?va0@V#V{+wf;o)Ca>m7T{oz(BtG+M8LapIvIuX z;M+1$wPswgu=l-yv;~iB(LgG1cDWbIFaAaxT|e*d$JVvBSHS}Qp85P+_a(??C!3QK zLu;YT^WMl?HCxl#Iz@DPQ=1jN=a!%boBKAhLj2wnbVBwYyTV__{}&th*e~b*Xhv6x zBd?NdFr4KJy~qXe-(QzUn?}>uAJHw2Hq~jTTykD{zuc==iMzK~`ROOcf`YD&Zw)j_ zBO+v|cD0zgWLsi?mvgH#E?TV6R-=^oL+@8-2-8B!(WTO|zXPpgktfYx>`V zyRVqKOqL_ov9a;fyEDaiOW_s`zd9z~Y^r@zy;H#TLekBLG|Ai3)-&C9J%~CsLwlFo zC&+Gh|NZgW&i3M*#0KxQK#F?mL~4mm1Nb4_a-FTEkHZb_iZ#2-JZX0rm`;0iFuX1Z z*A{tCD}8X_8db|!++urymGi#_EF!h)3RTaSt*;9nWR@tYaAsIpMEuHR6Ysx#0bXvF z2qz93``gJWb}xeXt_elymKXZf5~F}Sa(vv&Ol=2JQceMF=}fy zruaoSO-Pd3TH+^Dmck(PgOp1OTj>Vh$){_g+D47vqV+E?NfE>p)2d__PD?SKumNAlqzejArnQ&XEp{Sh`^z9M~EYgZX0 zqrc|O+f`Ez`)Nf9a=*IHvG?8ZtB*+6&+M-bzE2eP%fH&RDhmcP=AOAYosu^q^=lxk zv&Z=S%NQ~ zp?#71y-yFoS6qCD?~gQBNT=~&4R-~c^2Cpgi|^fVX-Vv=h5HZISECnhbIbfn67=_L zNxQLZ^rJ}y=fwA6awnHV$G2!ib6Ozz#kPlx0n%LLUAo8jnbl!NBR`DxBV$zO{fB64 zv2S>(IDiG^c@^y)fWz(9Yg~5B9#0MkQheLLt?63(MPDB!aiBs~QNKDq)IE{=v=r@H zFwvg3P1@F9Pf@MYt+X_kq}fs#U!ml%v>nGmC<3`hjvGFReR4ub+AdYb+Mdi%4|44c zo8N84x{ujDS!JIK=6+2jt)TDD>{)Jb6u3i~^yB_E^=<7)deP!@ zCA=vsope24+UQmJ3MVK$O1(yzHn6myk~)Otui2Xq2gLzOi@kXzg@lv8?#SGvv&MKW zz7a|747TX}{`LXF;=%LKNSpt7<42Hn#S~L>7zUfMP<%e2Q43Kp4EQw>Ks}(g(AbEJ zDJgZDy?SY5DWLGenND`LwyaDk2%oNe^_Ah^{Ju0o_h3<%Klt)HfBuW5>ZIH^(pGoP zpTzsB-G9Ov`l|u#nLcn(Y4J-Q6}Ip;7V?n#KbEH={_DSV=rcCeLh)rh#_2z*+`3iy z{Qv3g{D0AL%W>jqU8q^Oo$gCywcnd7csF=Y06J{DmT z4**;m5+103dJ1v<}9mD2laf#Uyi{sS})Jb)$1 zD-ViNBR20%M-=@Wq5IgOHPE>At5m=}z(Z@p;QW^MmBqc#SG}`Wym>)M3_B;#C0q&ClBEu!csCdNt z%|I7;i9>MGFCx{Cz-W-Pnu%X z1As4&DsVX&JSJ>W7E^D!eqA#5#}LFz&U!NQ4}p$kJhxftTxw?ykFWU)wY?5X0$#kI z3E(}5VA2x^RWl*^!Tl+J$ZIfPF=*Cm#|#tR4kT9oo!lmxk3Fpl9tG1shkQ_^L>+t=>CN-HNZ!6MLidTa8~ z<6=cZMSHZ+@nsr7mh}K?Y%Y|p3C(3+L~Dco+#~YS*_VB}`$lcCUGk{z{&mg+L{^NA zWcQH=cLY$`_ns`P&F}o)!0Ws}K~tktU~BogM|N&6LTTEJ%48eIwM&irh*05<0$YBF zr4yKVTeJlIqP|eMw>Y#O7x;@qFWiSc)+B#=<_O~Z`?1Eg52F0`p2;<~1Xu5^TpMj# z>xL!*c6^R!QD5gv?-$$d8mt1e7OlD(fBjnb$%%1&$Ft}qZ)hAAA88dy0O_5ykdGhB zKFL7w2uZ0Um+5lXK+5LfeMTW+QMHT5+hdH`2{N&*xVv7$+G`4yv!yrGwraC+_RODq 
z8l`Qr-Ms^qb)<5umBz9&d!hjB?>H*0+EZr_ zf+sDB&DrL6PwT9K5zCJNGQ#s8>M$qWd2{6~S@$>aPq=$#3(p9Hm92ycQmU%n#!!yC zv$!x`c$g2-QF6t!?5zoM^{1{>a$=#azar3LzzO4`QL2NTf09>bc9@KZX$d1h@arvi zHeDRSi}-iB3e3y-lTXaykIjqbswivk)?%x%(w+h2Ve&j&cs5vu>0amBxz~|uhKO3* zW@)*7QA~FyU-<$$rd>69WN;I7(!2`Y{IgI)c`YvLAMzEG>ul_6E!%RWxC%$uEb0r) z0XES9+vb@9`G3Ldct%P3}$8@S=gWr5zFCx zL!R?9R2~9S>K2nQ>2lY)GDV5Nhy6p#H3g!MCDBg?9i}TvtX5m57&Edeg8}J+Mu;I| zmBdNtnsZg5XE8?$?r)`S^9?{zn~XuAJ|23q;OPA(w#E>~yq%S@=S^&+h={p2QYQyt zUvR8(3^KMM#+^tly$JvUPi;_M|00XRLhlwnH_R%*yB<`Nra2X8s-gO_*sOd_q!f$X zK!A3E0s008Z5H$I$0MN7W22H1P1tsd-ITjw8IPJ5pIlUPCZCge(w6=2eCDNfjgu&1 ze%GI$3}Il#`vU-s@lgAi}BXb}fMY<>K`Y8_EI7V?`mYr9QcKet@xI z*>v{PPEV`-^i%y-o{6X!Y!W`kOGlpW+>GoyJjS2bb}ky&tJ6bXLcJd zXlWsO=E(l5`nfM;Lt-KTCc^QqjD*YnqQ6Tsb9t(pmx)ORIka--hX{S z3e`L)=5w^0W=CcjNzKAiAhA?o;5H(xpX%!~dtuoe?dVxooZ_MH7G9^}cDxytP@S$h zr8v}%4e)S)+px+|>Ozy}nDLkm0fv6?D zbVNy0n^i<6NX_?0G{G3D7D)h@Za_%6t6C95JfW-;vso7&X0*t6=*5gkY;Pf4e$-sz zzuvXEdqb_SmnNIi!_=I_$RHzQ4lP(+^J4!Mi z3N97*Ew>4uwp>hN6QrW$^=+WF9Rai1J4bTKB&3P;4Ne<;Ec3Ks`HWhnP#D~M9iZ>X zv)eOMx^q69+!0UQ-XAWeRPRZmqvbUL{hX^bPnK>m$WNkb2aQ@1T0Bvfdr8h}EKEH| z9&^XPbVg%tFS-pO-)Nu(Q`)+JKDS-gFi8%&j_3DP`6TU1DpnGbsn~gq}kD7VddtfDJ8q1Li%VLD^S+v_B32rs1 za#n?P1z9E(G{^f4Q+43Kkmf&8`aC7XlGh6MgOQLAHAmdE7ow)04c&gN3glydlk zzIiYVG@9m}_Oco_M7d3hqexpnLXUjL9N?w)CqgG+v5V2Q@#Lph73Yvk?g*Q;Vi zAKA7W6jGs~GnHEhitVB-pPis(qdxpPDt#YXSmJX+79N;|6r&y6$pwzP(|%#?iB$IH z{3h?g+%?I|R*VU0qN950pDwZkq>1HxL;Fg#s~a~57$_|_4|dxgMsU@CRkxosadmvH#gHNxi=EPD5M6$-@mXQLUS7L3)*_9c*jfT3|+a9huGyF|j35n@s$lH1Az#wXi zR{YH#Eol*9uJJKS$Ti`nwDy9PDaSYgfZ0C_OXx z1Zk)}q7N2s3#7#4#Qm06ONl5j9h0VEwej{Q?%RcWS%0GKQq}TuPKrE+1)|0VUT4AF znw%SNr%mbw1AN$t zOAmw8%#v%y4{4YeKMRQft(?sHZD$cn15H@B;4+8;FFQO!Q0p(Xj3juyWX3AJP=0q= z$Bo&>Wp&ro*3E6gj&8O3Y}S*03v>yuB$^>*zl+-39r96K1DJzhBs9517pWKqkxF)- zew6{8Qo<<5CGc2y#@1hL3}@k*iP6~18tRwN<`^&y4~tyy$J?P```h4_&4KfWUa~$j za7q$y_Th|iIrVoqe4IfUupm9BHxk5 zgVT_tbl+yy+ikXT8)u@%SZ5atF@d5-y{It^mt-dA zoOie!4F;vJHQz?QY2txr|3UAoBD^7uk~Q*-#adP6%g9Eq8QTm}kVVmV@vJ`xv|sh| zBfIz&xU7Bb!UO|T;wSG`tH#%@>vt_Q#Rb1&O-lb%1S|DfWglFKnpEvgqAQ8Ums!Lg zb~@LVaVqR@FZJL;$J;2ndL##ZIdnU+AIUgQ)MFJHX) zC@Xt`{By=>Jf^mtLm%Ok4c=eIFYD{PJ*tz|tHjgmu8*(htVLPVCtqMq%%=suwl+YR zMwmXy(l%Lmsr)7mJ?!~Ku3agdZ`+4Y$34l;CCThsL}S{{bJxnPMf}lT!G{e>?E>#= zu-%r9Y`y{hd^!K14(6``PO$2(E%Tcw(%Ig&ypTQtN*IYK)0aal?>2D<@NKE%(A&p3 z#kIbm0pB__;`{j+DP?x9Oe-t9f@vzL%no~GtBc|nLYY>VMH$WlDI~4#x_utWB0ci3 z59a&q7;MI8{w8T5R&ch;*q4sf>o?f_y4L8i*uj$!nT^uk@O3K{>N#l3Wb4qjrj-*f~8ZB-J-j19>k*lOTIv$qACzvV^AzUL5&e# z95zHHTnP{P`c%#5fdZc%hO<(EPeu#FoM;GvJjdQ z@9;!Vp)r~c&+E#3rsq7uy7&on`=urt@yZ&93m8qc2`W#!jCo&vd+kqrBX&r-0z%}l zV54dxrZf#Q@OK&Pfi_^Bc?iJ|-bWIuz?J2ih(y=Crf8e;5ZvT=Z&(d-uqIDE@MShL zq8`83rG9+D&Ei+%ibt(rm=BuS)8AhMND^h%@jUf8>6^kC>cV!dOCe87C`{a=jhRqw zZj1Qu9^p3{qMd#LvPK};2WF4>eRZ(b()_*i3pl>AOv}BC4$Rf6hSZ7RNUN5W`M*9V z)b(4p6d%Zc_Gg1mjr8wa1#Rihf!4h*sHl!wgEp~ET?r+UB-_-a81Ugks_z>*r|NcB ziz-9tTzA>F8yvRexmN)RqJILJ8wie$7%WTGu|ZGE9Of1i9^&N~^BGb%s~~EyOu`-w zPvJcyC)T{E#WD^x!_OHZAMSKjlEp2OW_~d~?I08uUB3eq5Y3G1>5F!l(c_9^K^Axe zvyxffe4L(~vK}3nc9#1!Ti@Ffc%j4lkDq=gTf)xl?T+K2B3s+b%n21M;93#OlC|pu zOGpaLaQ~*7YFD=@u(-*}V@pOQ55jLhKGC)crAZZ6>*HqX3)2>z?L1wOnQ45Wqrcp^ zx~Mn-*;O|+@bq7+;%y;NQdDkIBZ?!>cnL_jKG)EBpt>LjI2^d)T`6fKlUk+NnLFTN zVih!98<{a1dU>#Bffr-vY$>l?W!S1r`1zZ)>Uli4CqoFY6TEqC25Dk$35KAJ^CMNm`P`=Y`Kw>_Tt5aEs(zYZ`+}-klQnp$~uqVAq}wi(JmXaxbx0C ze^Gr-FO)S=N|0g1(U*Xy?Y)l}aS^VQ3lMUg3OIJ%J9`D|It4#3MHe_!cDEqjBh9vv+#SDDocKg;Xmw?r!5biGC@ zJJ-!#(0_BWZ8-P8RHa#kGmPY<_}sIb|BJ$ClyJ50z!1eLd&qwE49xMl!=;LUe=W+hI;1@i zd76ac4aRb##p&QWO`W6;oG+|3;;FioP>SYY&QYzwNq(nu`YvZ0=^uvIwPJeOTuM2| 
z-cf+EV>iBm4LC;cAROI(biDEF?z=Nh_9olgXZN~?6iyyJCoW+JFnAa2RUo`hNA!QM zuvhMTH~T1t)dzS|T1)g=yi;yZGM|kcdwA9d3(OQ7Yn;6WqA>iWBIj!BL_~Qx{A4Ei z<=t3hI-63jMdFLMN>Zf9b9G#jb3gWows!KJJoP8xO(hBWXgSotC=(YG9@70oM#V{s z!&=Ips#=4S4h+V-`T>k0)_E40DV{5o3LCBtCoOq{WIMxM;POo}zj~%A3gHUoDU$dj z<(irE0tYk4O1b5_T{5Ti=mx&Hck}9o>!||h*BQpBR%>t?D{tc?aq-Zj`u)RzE!UHl zo?YLNf&Y;>`Ks)HBu>6`_R+I?@BTAO@ALl#0{joNt3o#5{f6u5bXl38!4I7Y!=2O& zg9%?QUwCU+a0(kd4A&m>ECA_rDg(Cd1bqJj&1$u|m&F1RRMT*LLl@i$iFjUk_$R^3 zj^=pHL8>@iwTYA}Y8w#EYcW01<}o%B<#zJo?)D!KXakC=Pj^7!2LKNF*R(43u?`Ta z^4lHx5T9x7wgCd|8@>C~Lhf-~ok#UU&TH}84#21Joed7wJ3hN%gY{I4~+<11!d8PC^<6Wd~xheV)45JDytXCNVYw8j^ zy~f)PnNLRN4ZZGzH1ZPN$@P8Cvw-InW$*Q5X&4yzf~VT767J@^@s;SBkrMe%_JcT; zJV4sZQYE%&bC5YytnhXZ84|E`t7B&hf)u@E)RJ+?m_m>%!RM6b*?llicC~Q_mY~#o zjnq92+sDN3P00TEx{lS#f24g>JhR#T9+&_ia!xH`nH=@ zTZf9*&85?;n^k++gRhq!hisqp@EaCoG$u`c?V8oTBM^QX1fLN2SS5%0cDj%E{)Q{> zOuy%9N>@gk0Vm&Pqt)aq&u+LKO`*Ht^|H8xgLcAZ_F^GiZ;z+(5{1)Zn@;Ect%PJ@ zSZmlHMt4xv66e%HaPI=L9>=JnZCbNg97BgQNZ4cKkFK`~2x`c-Y(x>T)(HkfkbKLr z@Q(ix(Y_}aiP&JQWHJ$!*)HE}2t+79tB{y$_CXgdnYEYi6QFDuP9;e9Fm#R6~958%&T%PzzZH32L!}D$A?9U36kyLhCeu z+D7=Wk_XZfjXOD-F!&wh6L#>mo|noy#5v{aEcIh-y=}Uby6+Ql^{ukwF<<*6>mz+G zoeWb}X$2>96yNClOWRtu#E!g#1{c9DSD{3?3D)F-)`kpfq5EJf6DIMOFz%8MP}P$$ zo;5h!+j{8^i=(}IiTX?R2{PoDr)g=K^J5=cxI<%^z0sLsX|N!Lk0|f?Oz1UgE!rv| zBPqZ{{(6WR2Cx=vvW9xvS&>ryU0}P1oJQvG%bG+|dA3KD5kU;fm6B5!m1kW}t12-6 z29G9T*|D$VPT+7gdg)eJHH=uIlk?}rdt+nfVqs;}a{eu-1#PQ2vfs3gf9}EZ&6Bpk zLmF?RePT3$I?b_XlHK5@FhD;{0%yET)eHubQd)C{Z9|dnxB~2LM`@VcyM=+ zVexL2ru|G9QrnK^k*8s0Y>h0b31^pn{^-!1Mg1#W^G6bDww{ZkeA33sg5Y&vJl_x` zaDWNe7AK+5noFM`gof@iDuYavBR7A+CuuC3Bk*9uf(Ej|LWh?RBE3+PQ461VmDy0| zNTIy%ode202bC>AMg#AI8LUSp7Nf{3)dhHjDChjsHOzK62%{G;N8kE)tW2m)p1Ay2 zc_Dxypa=AFFrFG%$I`Z?kk6PpD01x3PyFB8!-~ItJX->%aul68F~D=Ut;tx8X+NRxcQSb<*o63y!@q?u~MqP-=S zl9np1k!7DYgZl)@6^ln7e7l?brig1q2v{=u6!pGGZgAVvX;cDMq6D68%-x=QL`RDb z^zQl>ozHpbzqt1;rav~}zoFa2UG9I^_!H!4Rv=%^r9T-Y`tO??;{Q!ZI@{S#Oa1Ph zSwPOJ{(svPPoMekBG>!>b??1K#4zrA`NVffV|TpP_Al0gPguw~{nGC4n*n+cinoed zjK}S%Pel&s?b}x`c}RY@CMhknsTyf)Y-?l}&UWPj<+L=m#W6T(hukC%WtXd*EYoxP z_oZ0Uy~j+I-}*tlj#6p_PH#e9$7qs-Oqn^0#l_Q{ALnA9g>Nxhi4Umyo${}Q{=e%g zZ=VsjSx)MI0~a=L$*=DoEcl+4;{UGm18#oqYG$!IDSv|{7TLR)I}usV1UZ506xSjK z7Kn?2C#C0PG)b~Ce07!cHtEkdY!~a*mgRNeeBV&kX|azV%k>`1iL0bPf13T4rB80Y zvHb#+IW0Z;x%s~n?|&Cuet4Y0M9&BeA+8z4Onj$egYF9IUX!z{jSfFy_UCkH{t@J4cMp6T zZ#d6SzwO*|GAz+k3K!W<`r=$!G=gWbO;5(mp)F;}c zW%4s6uBb#NNI&Rtdn6FCkMxZ=6R-|G_H>@garc00tTF|G$EzCTMME?Uw|a)APCost z(3tZyC(N_+^+rm^(+;~DDgB|x5rrmc_;rnwq4tB{$|vrEfj ztpVl9%?)wLX%3%nGz(Edk2ZH>af{7wpFIxkn+DKz(d9cAg%w8fFJ82dINGq-tRMB6V@1W@_0+L$>VtvUf#W9FjgBeh z;H5=$RWp{~s9lG~(n2n}sl#0{*n+FoO6*wjq!BVqXIhB<_du#JonOx^_KOCF(=(bG zoSi;6MS^C;Q9KsM>NAuO=AQ27Mam0z<-TEHP;T|wjns{?K zEjA7}SjJ}5_(2Uw!`C^Kzj`M!n$~mlG;QC*^ZyK0WWakK4vx&=zT~ceoIHo;GqSSc z$=7$h(236VvKP)%xR-0DjgQko5r3DVcj%R;O-=^RDb2eKATXF$ep6-#JanowjKA&jcHJk_3w48lIGa7Q;D3zpm#YOt6h*0=%UC@- z%$cR|Uw$XrEdUCZhD?I0ihQ%3rjSa}H(5PB?3_v{StRq{K01u@@~OXKGn|pZG$T~OG>R)IL4T(a3Tc!Tp21}#Jxlk??^ro^_+Y&*Lhy>woZe;|*TOM>7L z41HT&D!p1>Pbk(pF>k`AyiCALen_2}(pS;GKBmAO-d#J?MJDCng;`6Nwe|KbcU>yf zRP*N@`t~%m0-nE=Dy|UK{IiuM=@UMdhuB&su5LhpVg3qS=5x=M&jPcwE@}Rl120?THeey^HARV8wMv@B_Hzj?$YuBdNvLX&mU8ML+P zpN^zPq=i_SOf5X-@c98SBjvOT;T8lekH2ulyEN|iryie4`Ryf)n8Oq&%+?R9LUO-vUP zQkldsWQB7+2z{nVJ3>OHw{+R^Xz+OhLat&=G!|LQa3@oz9ID=0p(r7iYow5F@jTiVlCztmmyx z0=>uj`>m3)i-65iDe5!&6x<*~dd-i7ycVOwr~{w+d0%NA3Ku03^I2NWsaeZYC+)VK zWj1pQHJdoT!sWl0O+Ko}F^45*0H(4jY3i3_Kp19Ew6G`0cqm9v8|g{P+`YK9I?=?# zR%~nNo`ibj3GVYRWM#>26*^Ylt%h!Eu0s~J&Ezz&tRdYW_1ZU~D^@D+6(yq!KG|az 
zCCovVzFE1Yzg7Bsd-5Tx64gUG-v$&r@V$D#Y@M$G&Af}d0er_cMIx*blX>{EAA)8f z%5%oZL~!f6Q>pdz=Pj`JR$8vI%0NEzGB)r{0sC0X)SXi{?F0G;gd(T)&d?U6k{sFT zf{PGmi>@?|I%ixE5Foz>KZNRRFFsKi`CZ>Pyw=99{4pHd(8eST@yV^z=m7uzIyDQcR2hLXeq5iBnvA5ezIJy!1A4xWqz)n%(9H8{!w4b794w?U;8A$>cZ_5^wu zVpkQnz~56wGHLr%K#^b^8nc~bH*U$`D4Y=auz|&N<`rlWPp04r+S9}G)uSODx7KITZa<@|kluRxZSRT`-Z75@9SD2R%Zg1G@;CYvN;Z(|!ex0f%A4dr!ASq&VV z3RxP<6+k_3RZ!ZD?I)?Oj1;*FQim}oGom)=A5R?FCdT@RnQfTxs}T>qg6gCEiG(BF z{HAwfZ6bCx_m8Hp;lSSB@q9tuKNnm)%#=apiV`2Pc@VE6%5W;;r5gM2KIG%yeD;gd zi#z!OObh_X1R2?47Z{wGMCXg?NOkY5ndKEg)l3f<)lD++m5R-9PSv;-)c#hbNAVW+ zi{)3nFl-QT+c%c4Hcn?Y&?oRIIY--A=jElT7hFde*(Bd7Ng1*@;jR^eXNcA}FxQ)6 zU{ifTBnNZyZk*ll*O$HV-doUlYurj69nD|o`9L}&Ehj1hLcLRdL}oC}cDa&sIeTz5 z0w%-A#7Q_B;jMI3+farw2`REKg=mLl$?jGvpy%dh-Yi2p z53<^h7dFse<6fE|CA~i0zLGFl#AYO+FJ(YYYLlM|Tj@U-th3R_%qCwwULg7LkQg}t zFgrUjH@|K>)6sH(%Js=JeEFqPPxK(%h!;?!ByyD~9W2Mb#F&9sM53feJ)bFNz{p5* zWO1FQzpW@u716FvX4!eIYgT3y0b?{Y$lP`6-7N78Y(W&M)QBG;<^pUk&p72hb(@+y zXRC>hvD-@?HHgJzb(vf4MHdCt%a1Vn4=hlM?Inu~g0zWC##wQ^Znh(H>k%&;;#Aj zBN6AeshsTP5r#I@QUuIPOq_@3vNqdB}WSoq5^Mb+js&xnUp^xq6J^cY$vt6OuVX4iIvfe|yrq7v)W zA>L{nu}Qa1?X-|%$9D`$TdvKJHd%rkq$@10+kaxoA>Q}t_3@f-K8jiq>I?`lsIqvP z*YY~yDaw`#yLWN#mFFjAIE>04Cg$2Z3&HF=|DKZ4O2>)q?(O6!y6U-z!vxI|m@n|d zd>iFcDeL&g#?t?y@r2r?i*3hK##0O8%}4OX8@PUidf}cR8Y{d_7}R%(86T%?ER{)B z+IM5o>^$mTs-)))2F2`Ru^AZT`OEnqYFxaYs%IS zw3psxiAP}NofH*H%aBY$s%7aT7($K!$-PSfO2R>I)$j?#NvX46QL=<%lp=F`X*A}0 zj#YcEX2Y_oDO+3$(ia?*S?*-lG+_7}^_S~2^^}wWKA`BN9uvS}&pMgmliV&#!`Aa= zbd{WlQWG$Wt2Qe0)2CbL5maIR<{e)Df>&Cq3Js-(P9T#Bx`-E2di9ZHO z(WoCbY^V5jaT97P*I*dyy?x)O6D~uc7Xd47^*Svs1b&X2n#WUxEek`1!Tbi&qkGm! z*8JJMnJ5jgProGlg5#)|;)L-KCBx7WGXSzpR<6Jlm$~sFaCW22!VVcrom(OBtb|IV zXGhnil{PuY!7YVyI|My4xZ67SN5EV*&^&ye=?F_^xy8|;Ck|^hz&;;n1Q zP0&y-we^Jgi97~IH78+aOy5WLS@7F-&j}TDDVMrvzg-idwyq~W=zF)6TJ~6Fy(!73 zum)^4xMis6C*l%e7$3zUk#}P9@w?ue@d-k*&S!qU`R>%TLP$ErgnRdDX9QP#4k8Zh zVr}|ZWr%l!M+)A;#&DYhk6J|>C(UvvB7>Pw`^%r^o%y6W6;`+YyuHP+3JMc3wXj@^ zdn#rdc|QZ-_=CCwR;PYU#FC)W`3G;oJ2%Z9bl_{qZqxM929h?w{!ytXf-jvQ1%3SS z+R>7;pnGQEgZBeatV88=C}Vn@(7e@@DrsS0V3{HP@{LMR&P2IYk_FTI$g%ct+Ofm) z)yGajaFBXWZ;cC7oFB|Sv;-$%^zVlj*Ahk+7H41dCV>lyp=Kq*Qk*ZJL||Tjv3->k z$@+ru_vAKRw)3jb$t-x9dg_Zgmnf2K0}1ZR?*4pp_u~28Ln*_P(sKu|{srR1sIYvg z3}@skJW!7C{^2Y)ZL=#t%X}CsDU484c<*LthIe1f{IS2a=v?r9g{RlY0~y)1tm8i( zF|kJ!4|Vb;+?%0D7QJ`<(m@sf>lIG#Qy`}zVT0r+#!SM8l%!J?%u6(USohDe*?R;8 z2BPg#+xvdJj5H=2%^@Cx(wEh=MGs73_L`gLhnB&|jqll~KvbZ10;tH3t=3 zejB*FaoEPBXIBN3LCjjLC3KLyI>5TRGZY0643e(1%v|50=?!&=%>1mff0~o`4feIu zDV$XC4l#Ow=^xG~`feU?x$>TnZ6)I$YnIOAV^No44odB@pRa2TPI9K6JIL99N}L)Y zzU?9cLqUHEs+||wt$~qUGddH%Ldnq3-4fCh_8@KkPSe<_`E6(Q{;3J? zw92`6U%5|lo6~>)zksYWdGqvnzkC0K1@5%u|1H?~zvE^9S3mQo`AeFrm=LQDet&7S zAkzW>;jzcFdu(B>5naNaGmzCstOR!cHq#A|?Me3v$o~a92)fqh12dMjwVG3-XRwuR z3sY^Xi$;mLdX;3I%<3-tj<_~o>&QqJA3cfT6Z`hH(k%uwJmSA z1Jn?bES2HReXD_2^e~WtMOQHU6Io(BtP&r_G_r~cmE)B@hN5ls>v)sbPUBnC2t zHLBtt%y%6K@g6Me!mQ?EsEC+h3>7-R80Oo8^zAl1X{n5(j@lqK=?K!u>K$v#9d&L7 zhK)u#quK5>xixpm;sG0m$FKaZ+_?LtrR7CqV`e?juMwI13M#O_vxKfe_ z{dMH_Ag%2XjC+5DI;$%lGnc`HS-c%90C{eT{!2w`oh_kWqVwP-)aHX<`|Y&%;}E3% zSa!m-+@Ka0yOLLCNrNOE}a zHs;wY;1C-z|1cv0-u0<-6s5Jc@O~eeZEks#1ZwC>b1X@QGxDU)!7)Knw>84Ccsi|r7A@j)~ zJj9TMrXB6M2?MgpHDII?(o-5K`b2@e#p^KXoz`s+Y5#&JRkt1t4jD$^S=P_>hx!7Dv`oQwqsZ98O zArhMgWlufSsZHDtQDkb>fsB$ctCM;CEVf`L3rd3fTwGklcNDVf$UpVx#CKVJl>(I? 
zambTLTsS}XIU@a$a{+u;5s3{10NnGHkO~TDCV)Yqnu?ZcMP@F2+s?{z>CvGE z@`?3Fl9!QIudw&8fqg9~Sy=DR?-9OH?d(;5We}R9gM|-yT;#%=0QZdu8UZHMJ^)|4 zi0S>t+ot-cXZXi`uzGM5c2)|hlx+p}fKxo7QtkQard()E;y{8lm$HZg zuatLq5P?P0Y~u5EgvrSqIrymG!v?)~vpa%a3FKMxa%^mmj$v3-HDmun+j4B3eZ&EI z3+v}ATNDx5dT4@+viFkCp0Q4Z^k|u0N5{!#g{$2jRXMY6>m-*felllFKu`MAdm&-z)I!|_ccfc; za4uek$&Wc;&e03m9^BA{Dq(S1v=qMX_AiE8Y>87$qNR$wh#xm?1^9Zx*lCJKg<#jV zBDZ2|1C@e8dNC2%kRm;^@TEtIgzisV+KvcQ#+%&Pm|(c46gn5v*Ib>_J5tHqsie_& z?F`6$8q8mnJi)3W9bG+M5hiBWw^=5wTq6w{+vf<^SlhWENILvt4^SCr7AziG0)p4r zee?XusOA&x=UsMI#z#Fr8vY$b7G4gQ%Beoy;}!U=i^oq-@fqW-oTfQAPfXDn%YE!( zWTr{JC;2m1CT`Ma?<>KuzBA{&Yb;W0+)$SCnUf@6#bc6=yMe98RVO-Pt$omTQYt2t zi8!v^8bOWZ$7*x^+kt_CQ-yrd>vJw1+ZdO@uU=(MN~r_yUPhW0wa;Ad+EqpX8WYA=7kl9gD zcgh($B{i59s7}Smmfe>Y5l`I6KJnYkO?6A@+i9`GpJ3~MlxBh`dL&uGCAq`$>Xy%4 zmVHn)#my$(^C0;#X9nlJ46%C}D$lro`KbfQHj)PrJ_`mW!hi6WyjPw_)E(`?g@uJT zJN-AVlt70>Ci8Z_DjDho1g(p}m2B%zW*;$h^j68Y6NPY9q$7a2X!#*mLhDnl^kM8V44GZ0gOGBhN7YCJ zQLwCK8`2EPHY{ytu|g7GVD(WPdNe{06`qiZ@|S;=4eSD`y(AsS)J00##XZan&g_WK z{N$!*CZBCNERZZGvES{9v=QjrqfkIwMTht+w0Tc_Hf{<9dsTp+m}pUJSL%sft_GX9 zlUzZ{m5GB}@+FE^2el69fnvkN?W!r{B&M+Y2wf&ML^4`4%gU2Fc&?jy0j?R6$ydqRFKs_jZSwY!o ze?rw<75sy`XQQs>hLPz6VpZK3F(Fq!ulhl)--Io!x~+57IFA^h~K z`Pdg8&YzcgM0cMYPIhu*CpAABe_MV({4ZYskAX>kASBVaDTFY>9b>hVylwp#ElXdk z#J}12lDthFcj!Q6)eF_W=A^Z;ac(Hf;42RGU1*GydLuA5PiH*w!v5SGq68otpWr-J zTXjqViKLEdv{=I<3pqTOdz6!z3xlw&3@qE9Q-*jp7n!xBEa-#WO;OWf6oi7t`u@*H za)8M>vpvrUZ5sTu5$StS^XdBn1(oF|!+#G*8{X_>x7UB=fnkkq`QeZOdX^#Uo>4GI zZW}uocThhx@iI$ETpN72<8{P+=;D%dV0i7?wRu9}n1Z4(_^WZ1VqFX&Q*3ZQoWSoC zMeTHiElTW+&*qFufefApGGF0CY)W#vRvz?$nOgCY0r2R3jDdf%bl$21)`+F>7HP;( zGPwvbT)k`P-4b1Ujqs{Wqf_+aq;==?3tZ{OvZf9XmwjXF>+6S^#etD^Zl36ir3t<) z22Y>nE#uES3~qiGOqN8f^>X*JXOj~ghN(E37vlD6+|@qS z{S@IGNt!z#zRk_&yA zc%%Akd1hl?H2##y$yEA_%MgPapK8@7!wiTLp--m|Oe-!Hba^O~+orDb^|5X!>!@?o z!j`$dS~~$%z)S%ofq*pXCExoxrQiUIt;9jc{3A$g80&D_CYhV%VW4e7^_!U6c9ri~ zKAXpfv3BBrH0COgdAc6JJOfB)zQT+ac1(s!Cj&rP2!W(U$%H}y_&SzT2nNhu2S`2C z6qE%#U7{2isFS0KBzG*!{YH=bDyz}ET02x)UB+ovs7q1TsM#1{+x7y6D%Dt@vT;8z z?( zR|hJ_T5_u0#k_PF-@NjFx_b|xCb#cV)N<4#il`Jt1yn$!EA`M(n)DI^(xrqJn)DWe z1(2>(0qGD*2uPO_f+8TH6KZJEJ4lC6a=!%6`JexLckcY|EAwXF9(7E_CrM86OHOoFWRi+p!=V4}UtzD`NUep#kY4=a@qTs9WJ)zlTArR{za zbq71D&nvz30yNNd^_*L-$8xb9w_>YK@c{cj`&nN(sX(^oV6|p@oOm=&7+Z*Wx%igD zK7MSeGh({Rnt(+}`g;38C+vPTzL1F@5JlYROAo!RaL_Y({Yc&^N^8|2p56YjkX1}o#HHVMbj(wAO-dv40cFgla*mKaPr;RRQCSYt5eru1&wqAi zA6+oi(vVW!A5~#(?5FDFiIGp>ICf2-b11yn<;xtq)g!JNDd9tDV4Remhos1!8o~~0 zL%Quaf}T1Vsk&nT-)XRErNtKN)8g*0?4 zCH_h~AJji`sZzNno+wW}js*o|O^N7XCFqca>DuH(7ZTF_`v(Bx?&{KTTOQQ(*;($c zS9_JewHaTanv@^OVuIA3D7@0WT012a;C3-CHFZ2&eR^I9<<;8Tl$e#9!DUi4B&MxV z#A_NEl&bM`#mrp6rOd%a^?lY=RyBt4&yY+BrHc&&o1qT-e2sMH%J|$2_;e#@b4RnV z5@Z-Xp_(wf$kE^}E~!dwwiFQ4FWq?gHlhv=1J!V39Sv`~x~gQ|54+m< zeLLE*LkRq%_xMw?D8bruR216rMg%LNwzBDQ}XOuk05x zPyu|0T~n?d&P22d{sy%sQ9O}~A@%0R?BdFPjqeH&JDb?hlE z4Q>A3vxHA>qoSN!qZf0slxxX0IWkU>Y3+g5Z_ZOjHB7t*{t%@?EuQ!9S~T%T^x6bNNy>4dgOqwZ@r#6(SGKk< zmbp2P;#{gEB8f9{sp$%XMddIr{QhY4*3N8ax%t5yVNHrPPr?3$TfWA*=0dJh(JUr4 zsFe%y3}-seyUD*Zn_SIJC{?bh1kMkG{J%k{+78= z*MjTK5d+H!inm5ol>7c+Dgrq|y1_d;!;YL{HPz^}qM{=Cu$$?(0Y^XlV0gSYN1aJd zK5ik9$yE!IQ<%R|zTi{^j#3bQo=@|98+{1laNDSfrdSv=5;E8=)>W=y}v~?#c@xm*m zPon#F{^T&js4t^nzRoY;1$SgV7Kg`?Q0K2aIc2UXh6Ky|APf41h90ipu{Vp0jg3`O zRJ6I8rWlo)nhK=bTv60gQ3*5lnxRD>ZnP5CBjH>@4e1R*RLC!1o*y3{FOFj{*Qu$C z)ckcCgawSLn`HX}WE+{dK}`+=LvuBkHO!2_@p(p}(Xk~vBX0Gdf;P)A1rtJ<=AU2N zsvCYN;wBn6zuImnln!zl-ISK<6RL8x#1zcYSs({%#2|Xtb16 zetv#`3fYKlSdpL4kt}MTf4G&{;U$M&?=t1?_T6cN`kBNRSSR>kLLA%{Eth)ZaMpmq8=hTeMdXbGALz3eJz4 
z>1wRWhKF3TL2TnL%Uw<`$PtF4yi9b0OwPeo`+v1v-Ckir4a>l0xCYa?P3W*gu(G&1 zLA}Qew^P!WBF$)Vj}4O4o*nQH<@4=}9H7;B zFHyg*hHZsu(luF&B6LN`$h_4SrDXb|Lfv#}Dga*A$U0tO)<^{~m=PRIapGG-5XNot zxxj?Fk=}4W`vatXwJq$*&wIz#Le0VTSWu<>)OHY5GxMzK7ACiR?rTISIE$f4A z8NhIJb_PQdgZJeZRtW&tYWcJK=l4|hj4{Nd#E?Fw>Vfpqi2B8|9i@y(eF(<08H=l> zfH1Q?BgEijOqd}_wgdUVPGN~uh;`sNNvo3L7zIZ)<7PQ~^tojLfoW}PxwLb!Ey2aH zt3a>Rb+cW#@>@Y7V#!D`@ll!(gPRR*!*!+~+G!ohh#JsAXX+v%b)FKP&h>4nl?BP$ z{DNf)`g|25*zH?T?qDNdpI7Pc0kbDMn@Gw1K_7MbT!4JOx|NJ2gx^GV?p0#R9>Wo2 z)gnA3R7-YlacP~BLU;U&Y2qWRSCYrBuO4hUSWxmq)66E%_m{toVHr0r4zF;&90Jo( zCJHDku4{(OTR*FN6*A9+)Wpp#9QADqK&&m^Yzq7(Wn}rw3EB+1C4? z=9XXuetOCpgjG+~1cPbGF+KiEI%3F3Utiy8519jiyfXEDRm;A8dx{3imW7|U29(i1 z$brpwfzcdR$R4M9FnDG?7rPQC`wqcx)nN?~UQ||neIWYs!PY`1V2=L6nnR8mw$}lJ zLxm~Hyo;`{AG2FFpJ=@@T&YVuKALf^(guQ%E73l?tBFGPiClEb%4#H=(@C?|4h^)9 z5`Yn0-Bw5RXlZFhUrxRi^;j(!Kb#fjmLRzePTJ2}@{*x5hCJ~)g@$<<8TXIo;Kw|<|vz1PA@6j8>gGjIY(@i+;LRZ)Y^1&yeUG)JcX-Ipz-8wR5bdGZ*WAI5Ub35_7 zkU#myyQ3t?$EUOlAGpNj4d3xbaEUuz3*-{c-)B{~CsT&j4P@{<60bcLV!`bql^63; z0upUOHv??`8tkESs#4o2xhwB*xsM|`j@=EGR7-2Mht^OUO;%M{k8=+f3l9Ko<4cC4 z==gC^ne!sQzWe77a~TijSI#j*i1)$j2!~jtdyOgR*>m;L?H_-HV}w=aU~Yy%Wp)HR z*tO&gVTAU0Dd)uA2DtsX1Fp14MWc*Jo3i}c5NjZ6u*nc<^R?g!#b;x?-TM&uuUGxB z6lAR#=$_`|5V%u;{3xv<%#qv|zrNlY1r%O-)T+` zUR1i}{A!m3JpI2zpoLXt;P3Otd4^asi5lT4JMZlxU|3^}Y^QIPGCy2Ox#64szWUaT zLQLg=_Vm2Q5KGlgEpiBBkpAuGlUqWTiQmG_@l`f&X4TrkV~!@hr80?$+5*g_+IsLk z6x|V~F7a?_fOgb0mCx7<+>*z`{qf-A{oBgLhfm!)T{dZE3Oo{{eaeMa_=lM8WI4$= zKik-j?4GXF65y)O{#I)UTkp2b#?;B)`@pUgW}PsA_%;*e)u?Gm{e?AtDXi_mvMrrm z*T^_J-_Aq5as1$YO8B4e^~#iopgk6E+j?@Vf3+ZfDdacEP9@1;hwH%5INt6{tEbS9 z9SUZ-Z}Q-{MM8zy*@EJEq{4K$bi+%=~KgJ=c0Oy6Y~AC}W`>!r`< z>V-&{Y!CFA{Q>>?Q7`23e4Z0rk)nU^;z3MwHF!vlnRW|5xon> zPiac{=rTV^GX;6Ne)=YpbvUYA#@(6z{axIjm)qrKl97UOr6VysIPH-sfsvy0x-{<`{xFcU(Ff4t+>s1O8BEotCE3GA=~Lxn_0-oBZ5 zZmsgr>bq;pj2p&izs9wTSDv3K7-Zp}<)gsi2g-M!77@so8zb=#Yl{ApEYc;}iFkup ztMG2>$nOHy$yR?*^-xD{#U@mOMH+c#xZ&~*sn0W#*^sHip_2V!rWd{cazow`gfDye zu9q+}#pW0EEH)RuNh)7%r1$n5{F9O>S3(E-QsiJV&>?WN5v+~!bquy7nuP^6v0Cr( zYQ+}gCIn?btTi4jwAV77lBTia`Me=^Aq*~XBmW1A8<#+bA16XwXKbfBw5pur=GP}W zt%t@E6CVuRaaiyz#?-WQ=G5*(^(%elXUgPy&sz)bKKdMeCXAnNR5cG7)t1q&zn?DS zAtCVW-i<#kR4Vf*=r_>MyR=hlv=dyK7UGNG1+jVjTYJC$XryGwp)YdOG4&cv0PpD~ zT8gi~o1B!NpHR%x-!2o{@^FTe1t4RVbX5+IeT`tKnMlZF%}01c5HFO7WC^xgI&cPG7{aiM03L zykRcjVVuf?>`=_EvZP}FFGbE6V!I@_kL}yZ$tIg__xQ`iQ$yHL?fG2#R(q_T>BBC0S|9{L z+TeGL#6WP=q+|Bal#t86j%8}o>lx8Blah!ixY7jLhrO#@*DJdZUhgrp2Ff!9$xTSo zR;cI&J-A1m_{H<*s%1_4^ z&0& zHr^9dw>tv=nvugAqQOm=PN3*HPhZY$7!#5^zOoK26qXzS#R zB#h(nQ`%AHpFOKjDCdN)h@gaq4tcIiR~e}uh6zZ>M)MqwQB_IFt#vXLUNuk;C*p$e zwN-?a!Hm^`T*-0UAs0-~vhFyzDa0ejoGHWfht~(X#6fs#i4) zQGEx>ig<|mpv7M5i9D{gO+B+pU-(M3rCN=#r($J1|3LjhGLG)^qy<2kacN(^0A!z( zTeV~q!uVMGh$j;Q`I&4k=rVFQV4NoH$GrwO-OqX7UOPgRoa1O(Oz||%F8*iEd=q#9 z)MI?i^87|bf?WtBzuZ*NTR{S&ZJ7B%vwl2N!T0fwo@pN+4wf%TW=vix>!V54v-qLi zSEwDB6Q6FV?V6_HYsMmzi*t=`PH&EDz2lcXuGsfLalM^CSZ%OA#82jDXohN%&WvW` z_a`XO`(xMS#w2prw|4+EcG*6sCQE*W)b2_5N~cG*<5^m`h{imXK<4iIGr&QjrC+uC z{*KJ+SY0Ri2_j?W`$@xLRHX!Xa4SIkYeXis!*Go$9AC3mHtsFBalzY7a zZ=h}j?h}SsaBe$ZiV?Mju_L{@n!<`v_Zr3@R2lK9f;J*NRN@V1qS`H^P5*efsF2Pl zHJo!@*CgRWNY-j3yS~r%96D(F=JovAv*R3l3WDk5DQJ$v+4pouW^d{Gq;k4-d2r%l z+UR?fQb~5UEYj>n#zoL1``lTH9*~ft^9rs+qy_uKr6uN`LGQEnBrzRL!PD>bs&&QG z-F19>tH5$~$Bz>abwUEqB{|b9<04iJV)u(0TyGFwuwmHsqmB|cCUiC7c~i1ajeR0*~@EK~Jd^9j$23b#ol z2B)*pVRAf)TJ{HzREyA(x{d@)mepDosn$Rm>Tf!gb&|EBH(3SMHE3H55MBl6l^pqi zE#GqReb2qZ=tslhawJJhC$hC2+!3EPpN0~cLVnRsqN`iyTy^Ez= zO+~kULCqd5T64N!26x+n8W z6HW7LOc%>3dVljLeSlP#>W9@w32Q4L(_-XYN=@E;oA7 zA;M|gpCl#*?OkI6yRO`_sYp>$(kr|G?N{J05E@zx2gPC3uV5Ypq~iEKOUj(S2BT%_ 
z27TziFH!1UTE|G8Id~D?Rby`3a|5b6ujls)_eovaB;2G-Y2C+TA(TqjdWRY*_)C}9 zrsJKmv1`HpbNgFegWCzCG(V4J#dZg3sS1Pwwic0BJ%F=Ru=c5d%qx%ZzTOMfoS)6D z0u=rbuCi6rAt8s^GdG2ZyPF_rnly3aZc_PzWAY^qsf=)_3uGiIWABkn;5QmE%S7!h zPV|muq5L+J_gA+q66HBtpQuK_<-zke)0Rau?BH$yW- zGdgGY-12Rswe&~jp;QO3=}|sIrHq*5P{C6JlTk3J9kK+Rj}_+l52au+HCLT2)M<*a zWjz?>@NQlgUR6_gSerrZTx+F(?$ALJ&02DhvKP`sXVGWe_GgTZ(&Ff&aQ;Jkia-C< zke2qhJC=VlrF1DG@7tK(SHJkj1>-}T=l6y!1O)HehV>zSjlJ3Wz6tMotsrL8d?EGW zS{4hEMKB$bUXHetS5E8W#&%lXydud!(bCma)Nk(WBk{k9o5KTwK`ajUyRpT^xh<#VTIOsLYbM{{ZApvl$LJaqR-e4)0ca8}Z<>qa5qvMct#?AJGg7s? zOyu9O0BOHgPjejj2GF4f)0R8O`bNg#X&K(akJE@6qquTydS0ON^IXuhlSs^NOAU5# zHMj8L(yhC5$0)pe*S+{9agXt$r0a0OCugxzIvbA0I!#T6D<4{!>nXq(#z4OQq>wdQeRRLB zG_TygSlPylkE@Hs`cL2-A**f^cHY4=9z)JX~-9)Q>^FD z7jOHiT%z;PZOZr9o|_vRFyxp1aCTB)vs{4|?t@v-XjUx=ePCucl)M^i8;Ue);L`n?I5*w*gr!xCqFXfedrswm9Q#_GAMq+iqMV{ zl4-y#j!sBYZ#C~T)B4p#e^C`NMEIVyvl`$;=KPj0_RDV{z83tMu19G2_{7bgC+UqT zw5W&5xUbWs$U)j#_Hxgj2_-lyHA{@CFH53m5f%TyHo4m7aPpjz-4>USoP@v=b($P@ z^F}1LV!M~R6v1x0g2e08gu+e2iA7!%dPjA9@oimLN0lLmNZekYyxXaR`9E0cc)K$c z%>|_Fvgm4sChQE_2+VFwg)xnzvD;%c`jrA;M?Z6={!?<19}zd2dkf?S#61uTsUK!< z6+=Hgh#bz@Z;jsGqepuWlgT@1KL>Z3 z+Uv32TH5<7o6m6mzmlN%5DOVc+&PhY4(&SkI0{X2K=k^!>`{; z82O2(%>bTU-TKt0*5>B>iuw5)Wg9A_M?V*HfvfOzG*7B$*=qm}56QTSqX1F`g;RxZ zBO5c^b)Zo7soImtq*3zW4Zt&GN;_4*jl@oU=sq@)kadZ-!ChkRDp|qyEJL;1w==NA0WYP+k3h*NlRiYIV9ggJQNOmrx8rTN4KTdt@IWP-4O6hKa9hf} z@dH=8vLEQc2N2e~uI^V1mXnq$y}4PFjustx@uNA1(l>x){Pl00o|?+MB7Ac|*8f3y zfRaF|tO4I9jR2RNV(~n<#iMdk_mLayW7VqBQfK{IDl^)rJj zG3@EeFB&#uZ!9(h*iclXQ(s@Pu))3ZWZlk)C9zX~Xj0!IvlKEi3p&+Gjtend(kEJl z(ef}9J))txb=PM_$md5Z z&t+sdOUo~+2&3qUt*DyjF#fwAV5$3yf>ajXdte2Hy>P@?9ok(vr{ zW?rU<9^QE{Mvd0?XhV?WPX@$W;|iBfD?t`$JKTdl_hMuqba&&jJv%h-(m%00_2q@z zG6FyZ+6#nn0!`l0rPN;a$<*!=^Rt4mRSQ;Ek(x+!wDRpa@hrcW{9C3okJ?rJG$cd? zCPrkU1-_-ASdku|P_AV2Jb9`-^^*V_LrDZ*at1uNSblxGq`XS29S}1mxzHQ=9e>Nd z^BgDjHAtWREB+f*_v{Rd{4>2%++ioh)X6`rzx1E+f`4ab)c(5!oAJLgGfwgT9}Mz> zpNEZDH^U5K9V+wl#ZiJ})$og*q$FK=sXnBv5^Fmn9739hlSDDVsp-@|lN=w}!`w`v za*UDMfDeNfLAL4Zeg`-Z@Fkcr60e8as7Z%G)yW*KOGRH?_MmPgrg=&eO#78odCq@H z0ib7#;(2+g%<>%ax~Z%?ylR1g3F({ zcfGasHqp&1g|;hsWIz>aga%m8Cg26Te%A=Z_;S%DSdE}gJRSA4w#8RbJu7tkCsQzr zZO&8@K7p;_6281w8u0(=ad8Wh({0Kk@;Uq@$exUWH*`qZXjx7&6CjUdNUbmMuN0la zbN>eF?_z!A7ylQy>)DYJ(EZ@(GiChKobDSJKOuI^Hp+za4MgO}L-H zElGCPoJ^M&@B7YjNl=Tou2`B^60a+S9EfJywHaj#qS^WJob}3|plKs!!Q*2!?7_6l zjBs2P38VFL-y2yU^6;dW=a3O>4Y8M$t+`t2TC}0=uBC19`LbeE`g>!v#erqf1}2@3 zymQFo#n0idbYml0(oY5K-F6!ei%6zPa|C<7{?{=UpJN17|Ecs5!48Vj#I$Fc0;Zj) zbkaBjVWH*fBehABUSiG`4YS-BaHzExKbvS?ULuR#ThFLi?y?!-^kFa8rF#pa98I6@&2$ z7&9!_+wV`gCFK6PX`K4OBH8&`_5faE0#zpi+vyzM{dZx^hbOc7Qeiaw@9I)W|9^q= zze@%GAt&Izm7&KK7k7tT-iN%{FI+47ws)DBLb*zQtgM&yz9)U#HRTp6qi?6oZ zZUey=<<%(WC#2u_FKN^yu1V-XLwTf5I1h#XR#Ngfe!^vr zzX?6@6&cLlU4Zjid%?Z6+3Z6zXmmB}f69seAbVb&7;Iv%sz!~hasUh=-?at|mOEi0 z_^H3cLG=G3F{1iyMBl&ApN5*ui6Sk9-2DGT+Wv1c+JA_R{|hYS|BGSv2QGDwu<8@P zMSJ@iS47kp`0)J=om1AK`a-fUHD$z1;q8$x&0DzBGF!L{sr@dYrP@DhL(Cc5dtA(; z%8#wv&7o<8%M3u6=!X)6Brh(j^#1l#)Q83AdTQ8 zhc6$|KbeNY*`j_*6eYsD{#MC8;r}e)%XJ{qe9Gp-hX{yG z2d{D8diSMm{NS_wemi%dj&W~ZLdY{enU-QifBUEOuwkt-U#%=(h%IqrA+hSn${T8$ zQGF{aVnxFnuLQAxp+XGIVg+nUqR#)J1QCll#`8h!I&d$U4c{=P6yzLpt78rz)fBdK zPOaA5Mxsg`11a5R!zyTr($wYuxh(r^mCrs=%wSrQdNjLCK(6XQ?v&$C#K-CR^A*?!0u1k@MH3D;&=TdSiUI z=N)rjIHprXe4%-KsK+WLs8M$zZeeb!w(zliiJ7PnQTZQ7{;#5pM!bk<Jml(k~G@k%aXCVSh9B0q&kM6Wq=0ET1YGn~2JLU7nY?hrKc?Y6^sVEQzzawe^3> zE3yUILuD&v4m^?wl?$6&@q%z|=e4KY%0Pf`&4gFP>QLS!Qxiw}fY+Ha^n&;55QJlI z-GZAq&(O}mf!bfO<8FH?xNw}H;BHN%Qh=_e9%o^AZgJZ`M}&#TFQa+g|Al;Q?#c&v z_8b+c|LenAZz0015|1r#0FHh0evXr0Y@VA!X3Umnt+(q%%}6*?_V>@%k6@J`dA6jj 
z&Zo4W9&yQTN*z zparG2toA$CTGNybx=n+ zR!SS$DVD880CxsY*n6+m6?hv^HeP;2E8rbBo2s8N&L)*#cI7B$Ys-Rll-_xQTIY*v z?Kce#YyZVKuF{}fWwyLQoBWIq^!xghE%qtYdqZM)R;z?hv6Q^(QPYK89%?O>%0GtQ zx7!Y&jO=sP=-Tb%D z)}TZ`Mp+D=g?PkU6T9WVaWl(4e))mJxJCu)Ud>XfVxYr~ljm1@<8#5dXJ*JTQr9f? zdN$^LK>*W=MlmjCQ#y{WLoaJPD7)~DS>LvFAf450>w*yYQ*7bZ9ht{(R^K1pAAmi+ zQ7iusunkM%E_GXgQgdb(%7Z`PQ-us7rG9;6OieD#gdwbL7Ol5G4=iu~vIxI2f)Jq^ zFZ$qE74x2^sIZ*X^wrtztr5NI;kI5Es>_;~(~%rAnZ67~0v5Y&HbJpdHFXcqH|4kO^Alpvj+A1PxklC2%Nc#Y96k|EP&@!BW zvqF4mq}=bD&was&;ufLyIO=-QT`01R6eN1ma&g}OQnI?_%YP=4#Rl@z`tn!*cSWuL z>~POEf>JKK>$Zqb>0}S1@;TI@s$?CLCn+*gZlyqneg##czaM^|G%G`Oa>y2S!M|i6 zX0VsIZGa8waXsz&dhw5DGfhCrao0f}I#`e6L1e%PMAK1q1M!YC>m9F=F)+=a7OS{!3q@%cvR!yDg<&OCuf9z-_(# zzH2XQkorCxq#2w@gI)-CN`c=h9WS6A&eD58$(Np;athGrl0~}K$oY|{;H?X&l0B-w zW!`FcZb1!jtO5|v7EZU ze-q5gpeq**W023pFeCYHYp^uCfxGPn;;LuQbV%CA&2pHNN`~!AiX7Fj`?a~M)?t$D z`WNRIe1spu`kD!5bJy6prA%XZ~!V-=EK3jZ17Ij!eH?Jv=sdDCXX1a%!ND2*WmnOaIpf3+e`$o zJ9u#u2J8}<;qSp9zL%3R-lT2eFDf?8j9;=DE@rM#I?MO#bG(q19ris&oTr^T60%w7 z++!sj$SCzQ7IiyRbz!)Yojo3>nIm#yiE)}=>c58CHC1I_8T-(`v%z^|d{|gDc4xrY zNAZAv=up;&zzBCG&=U6`CyRO()#R=Ib7lTuu7SU-?1mTEF{P-d9KX`lA!);I9-mjJ2MUe)W_ID*} z_h|ecbKDtT<+%}1HCvd&zm=I|@eBkaY)h;+A6e=%CTy_98BT6wp}G&Qo4#i^RzqiuAYFIJEA{N4B^BGu#$Znh$GKy+i0)bH z+lAYl(?^q(T4|WuTC88qnYtojEyy(4r`^y6s$ozHU{KPxeDMUJ9;xHT z1r-ZhuJ}Sa^2L(eJMnxPa5RN;2H4kO8+vR|go2>c2AdkK~MzEEhlR z|0-h!mMJaFN-8_+>L{_KLbeQ@I^}uc-EZcsA@kb`-&DR4r?`ky~b9;8P0rEG?9KA2-l{wV})#DMkQ_^B1e_SoO_|Z(aiW z@lPpC*LY@L{CthXRDTJ%>B(|#b0A9f%d?pQ#W>SV2g$yDoit!|4Ns_bqnXUD&_?vI zCxF`CmNUYw0krjX6cXCj4Cpa3~awD1O#YEUzO^snioGLVR7rOTd=jFR-P;3 z(2UiGBi7ZO0)UJ3F0P9-=Z{*v1|*GkOTaDkH@KW~H~v;;|3k>h{swvOJ#y_|pm)k^ zzwJD>|C>nof3PcWHCbetMc)mc_Qp-7whnB~qvOrn&P%LJts@uo#{<5Oe_Yu$iZA6g zNaW_N4SzDWG7ju5pru`~UM>-ft%NLmYIRv^+Nisfy(Kj(BjQFVFocaM+VIpJTgAf( zO?Fj3`wgY)Y=~>@JFTBvZb0;3&M!r7M{ruEYN;z8xeAO)`O2~W-?B^?NB2U8mSk9RA9Ip!K$brA~uY@;3uJ|h5Z?w zWBE%=bV@is<}0;Sw3tI;;xTWT#ZonGLgkAmbW8`ah91tCfg=lrDhYdrwPiceltWDi z^w9?albvi|tBv>536y%9KP#4xE-vkh0lNrju2gQr9mO7)SM);1VV}G$M26wRaV$0m zGuSrFV$b3B-logCn{vr)f^!ey*{qSX7ah^pK*!KUM?RuTJQQBV44pMf;dC&F+jrq< zAHo%NhUB z+f*FxjGvyoL}z&fqOx2Sq2}9oU1WK*si-e()%beJ`B;GsGD06eiwhO5E!j5nGSGq9 zsC6((-J6mWW!q#L+)>!%tSrw#{q+bgXlZ?(X)NX|6Jsj9i{*4#rjlazHW&jEF<489 z{*d{gcEho4rs`;JMG-qhsE_g`L~Zh~hAF%Bp^&BJ+++}*f?r?se4sC4z;v`*;CTBZ zh7jtv;AshQ_~>DR1@Htv*x^oq_0EoX<4WwV7FSk`y9zMF=592P4T#MCq zhaADEW{=`WqL@a+c;^`VmIO1IoN6b7obhla#)Fnwm}$O*!NAdd2Db3d&&>Fe?IDZ7 zEwBAmtWU`c+Y-z55IgVVc34p|#`n>Z!GY!Ax{q#pEWCzU>>#|oFLbqqSiOY(^lN&9 z%Xm1dC&+3BJ|@V_cKKFn(kk(#knbE5pC~{^ePfyBk4~Mse1UXFKx+F6H?#BT04APk zTv)19if?)bbpuZ9owXy@hmE+e2|MGaE&3Q*OX_AnR?JQ{Sz#X!mw0}SD!>m$Rh&(M z>0pcbZ$KI{r8)xL#@p?0P_|#3YsZ#RCzZIYd}~8P4FXNJV-H`fOIZx^EAxA-GZ*ob z>U{u=rn5Wy8Y;6VZoASb@nz-%i|Jg&Do;&iwInfNT!eDP@w|eUZDIj(`nOfrJqRZ@CWu%~;zb>+6 z!&6_0*w2Q2I*3jhrl5PFLCP>( zQlOOgcG7XIy`IZ>(^iT&Gp^~f3emi2HX~I2L6VTw(xnUg>?d2#+n?|$SO^^hc9HtP z+Q`obk;P>7Bw7J@A7-i*u}byb=;v^L#g|2SmitGERU+SBuHVAZQ?0^fjJ2Eic*-3~7Pz11!V6*5 zTRu4+qCLf!qX)`o`3V{7mRsjaOJ zStyC&RQnclsVea)I`ks!O-Jh*>i4OC{D-TJtdE|}3pLF0h%YfQTaPgR4q$&Sgx#g2 z<0-3|!hI0`mkOrxRgajnoQxK=ms=Jxl@f@*X2spVH@SZLi@V{9x+o)t*QC1gQ|Rg! 
zHe3W}_S8CLf3JLLY}x~g{W;ZaiS6yv^P8(!wu83!i(wLjA}~VZMgl@dxI{r=qiDGc zdwT~5yUKa?27a5qA`OY-($B;`nt{~*TEiOd@6EOy3{@=GetwVv#+M!&D3=@$hL*gT ztOLW)Nj(@i=CYKF-|82;`Bvo56I>o^_pYdv%}#Z;7T}@&~^- znJg_eg(Yge?c~0F)#18(p*}$Mv(q~7(fSHYvjuupA`EW=Ej8Ah{l@!KmKC+wtqvME zODDT^E1bCmca>eTSq)O`$9HSOT7FFia#fkTRaCIDS6a4asxOOSR>B`zEy0B6@{{6b z(t&es{VBAfz%Y>&X~4ht{oaB}sdJJ2RLMx4!wbtc#?-I8Xq{UhK~j7y!)pdnGm#Yt zX2oU~bqgG5#nJWp2?R#AGCc;Y|-AEO`$J|*f z*(y|#g^J4ZvPQssZ4=jO#HX4J6NlmMzur_OnmE?7v^R%!&i4csz$bv?eQ=ESd3$qs zXN-9TmG_D`EskL`4(!wfyP z>FGy$c{AA}6?Kf`71`9W5Ml);Cj>fZ_U4mASX%XSM^sx56O{r-1 zaHLCL0yUrWq@W%_W$e4{XRU~2Ph&Ms$&X$)KE=MA>g;SBC{kPa)H&UxY^xIs$RmxD z5U=I+z676+tgt(R_m~xuW09+l?Zr2Ylaeg2w3vq;OH#|L{&*{*yn)LZ9((bf?87=W zKSq+n_16?nAXlHB<@fsrhf_bVlEr4=!T&oFn?V0kQ>>B|mRF}FM@!Mls@xa{jOE#S z>lTF#7aeF3tWrErzH-Dn3Xp17HeQz>Q1M!-XGaA-y_ejVPZG^tl_@C$;!~l3$h_sH zUgS;k7W;cVi&rQk4jQTD_bunSy3g1H63Q4jyfH9KJC`7-sCWQT6zbRjFFC z7%b6~qGY9oT7|6w@1_kemGF0S#H!7aF4vR51CMUy_i1xyu!oDF@BQT zmenM0!{FLQ?4Don-Nl1)&d zA^*sqF|&A#+S6!;!E5@soE<)m^>x@`UocQ{Zkgko{Fb$~s8g|2%yraH*ph&mB{Gg` z77YQ)fxiHdLoS1^V$eM{9~|UC4^Nb%Yk@%J7+(8i-tV&8z>%AbI8lr#6`F#t4JOrpEDYGp)5z#lcxq@8D8siOxPc zQbua6nDOKp%>IjG_gVI$q19_A6B&yjD_xD@eghQ;s&K^NooqJUJX+DK*g z)XoLxQLsnOKJ~y{R%~iZJQMfYZmA4nJPMAUe8ax73kG)Wk&n$e{IC(s{kco3hnXnQ zb#Kc(bh5ckeK0+KtHExpgmZ>ENlQsfEy|2Nd-qtc$>gz=439T!M!Ce1^y-(T0s6;2 z?i~!9r%#_~SN@gL?<1^5mfs|Z&g32rIC8f|HOx+DXs)&L*6qF#-?yUXpL)yOv-=tN z3wGqi>=yf$mNB1Izm<}HmgTq3HkK$OmXRHke#4JPnC8YTg4geUE2m;a%10>C##>Fa zV9W&|3Ga-`ns7juJtfg_5B7khZ+py;^lblCRQlLSXc=Im3I9ix{@>i~zZLDMtkP0E zxRRt^JM&5$0K;{m-G-FPn#>+ko}-E;NqhZ(YFqrgV3#y`&!lRuj9714oL)I8^mBel z6xkQ!9Zb?y(h|lpsAdj+Y$T2AE0Kf77vjQhk7j z^E&{y99S#<@^4OLD=KEc3G5(OslS<#90>(!7xI*4m7|oh`2hwo1j-}ny3&=s|9@Ak f|IgSBCsJKG|MFqCvlGo(nVv~+iu0@5&))Br;dFfjAZ zz2E2Ax8J{D?|uFPT%Rk>IAR^^Sj4{3QX#;l!9_zuBT#*#i;#kL^{-*X;GVs=Qv-S3~@U%g*b9HmE;q|iew6SsZvUl@_Vsy)* zZsPstCIwF$3vUNESEhFkE;eY&o(@a`5=?3qu1rD#LZVE90+Is4l7d1^@|sKvy6=YR ze_x}aF`=m{%D?l^J6a3$PP6HPA^lQC516?kG1ZLJj7oGQ)Qb9;bjjZ;Q?JzyW7|UJ zmR#MXW#q_03W9S1?$(5Gi*M5cKd^qO7&ssM_gG*_hqX9as0~^Y?}!gRa^aPc*I6zr zFVQ@Q7k_=J`%I&#@d8klQ92p4wh*-DcT!RbyzAssU?Rd{V)<*#U{ePMp#61$g(WM# zPk{KZaZpbd6&v&KcwJa3prQEJxZ{bm`1=hrc3k+Rf4v|@R^1}|-{)guE)8+}|9W1r zWLk%cD)v2|2kE5Gaj-{dBmMNuWB!T3l@5mPE!J7l$pzDO zgF{LQLq;_?NEM>+nI$9)!~Wi{5yF?rY?7SX1xKDlyD<2-nwoCsB10o1TG!@w^#?dm zw(EA6)|hX>hC2SQuf%xQFg)r1{{4F!ool`{AeB(|CbT9#$vCs_2pkfcEPEG|las^O zNQn99oxZ+VX5HV9zJ?v92uy6BdAC)yv}F5%A4n4u6VVh^RD?aY6r(usZ(YH>g}^?_ z$B(nTkEZ32c~wTuTUP<-RH8hfC!`!i&LcxVYL&_bd{1ow#6*XNsF{h{MhjLt8vGenKKv~Ad z#>R%GrkNTuxm8y&iulLzgMBR?Rsadx@-BYQbrlr5tf5~HH9SXfz~ z5D?h>{46h?&o?$U<}UW)jS)ChZi92e+4iNk9Liv05);{lgh)bBci#si{W7fnHsbUC zU)vVh(|cBs`6|>zg70QQ;`v(7xw*OcYPL4&bUsL%;xsgifD+gS@ON@76ddf`)AI5( zl$0K~t~7fMYlo;I5f4(A(C^KnwZcJtEnxS zz0>e$BsH5uyMO4zkuZ!csv<80Ya2GErF_#TMMHhBcIDXGdiB(VeWCU43NlO}w)!%C za;VN~_|)Hs>zj11*n7?{!agl0d&k{)jA)Pf-@?lf_7^YGpBx-eX(#I{PEG5Y3~UOC zF@Pm#Cqz?KN;d>xBenm!&<>+PaSGGcX2>kNd(IMD6D+|lVfubi=_)(vOq78Z2p31NA$U*BkoOF%4w}YwW=mNTZz6gM~dl``e4(0>C-+}={?sU36`6X z@lm?!7k|aJ>PiMpzy~9vGkR(#WyR9j;slIlML=3=p!fn>80sNBPIj$oNtelSkbuBL{ z3M)hcPyB{&#?3^DNJ;yB7D~5W?RV@AAJ=psDA;X=0jFldt;qFuYC#?lrGDqox3;_b zp@pL}oj;Y#h&ir%S;oTPJ1((RcMMc~yM=3Y_==71$!Y1SQm*7qEYkUIZIh^WNk-1S zEq+~?uKfLB=DG7~yPgLu9P$iWDL>x;8)h%11#tH=*lS{)OCy%p)bz% z0ZTI!UZ>|&wf2ly5)^0k6ue+32~akBV#ss08xAZmh%->gq;$^`|p#djGDy&-lG_y!!ZM z?zDi(2O~1FHi{5 zxQyLI!+!SV=JowlxIk#?+77(~Q_zc{d7tQ-UYyJu=+Y(nC=AWE9yCYH+V&@}T?Hf3X#RBI8o1kC8xhUv zBPi{emD*f?`6y9sbmi^XeQa^9$x-d9lCyDBHoMVteqU`gX_X&_*XCTq1x9^+2X!*y zK3fNncHUfxZj&KlvBB%b*Z+nO#T}^C(E&PIwgv|(fm%=qN9pmGwb9^*_4uvO=H?nA 
zH;3sz3WWE*%pOb_^+wFzG`ZoMJ39?BQphv0MYxS|FI@xBdDrJZ!y}D&A|fKpUtGtH z&%K?M*OP;j(2jSQIG*s0azaNUSID6w<%>a^zIGnRyMG=h8+)SGt4$28n(n;WHhWs2 zvu<+!$S|5FAb{;JGxLlMc=1QV=a8)M-cOV~BV*B76iXS_{`G;ikLNP^BzhR~C!A2O zq@-l0@_0Cjej63*%NnZ^(SJSQG#d!CK+aGhFv86Lu3aAEOWlX6^kChgkdPb&E2~+h zL};A*?)mwc@mLV0<9_K)Jut3)#?C_)z)new)*5fnX)jm-^W#mIswr9G33BE8Mro4i z3u^JZIFCzyb`r_o`B%<2Hb~Y=vlpOR=B~lV!XluFDE<+L{@^?}XVStmVlVJ1=S9Rw z3{GEv9}WQl!SWOtiJGFn_~zzj0!?Pm6sye*oPfza%#mLO8N65w-*gxMD&Z6NlZX_n zAK-V}E@8{JSv5#4u5m0ExXPDD{+^6+eR9{w&{>vF+S&7Dalp=odO`wxXQ5dNtSkQ7 zKl{yZYN*KvaPO{srl>F<9{O`~$O zK~<^8tyhsz^yEoa@RzI+%X(6|KCSzhw;K5c@^>@ChCc>>*IDRt1tzzE(&lcLEecYB zo=*$m&OBBj_f~X+jBnkw-s^8Bt&xGH;^ty5l79u+v+}zg0qKQ?H3`qQzO*JJ%3c_9 zpRpbfBx_hL=X(MCgZcRRr-wYix)YLSpBE;1e)*;rAG0M>4bBy6?CpYO`bi<-dGR&r zozcerLk3cW*qwa1F!Hw^U$FMLD6d@k7mvj0Fk-Y{Gy5p=i0ZEXr#!cde&HQC&-nk3}{Zl(v z0o3Nr!j=Xk9BHxr?8D)DF_DH7`0??Zhb5SnR#qtB#ZH7{F0Q?li3&X^EYZ>;UVY^s zQyZaU?D0tZQ)3_916+_ruwYYFff@xPQQ=^A#mIxrl$Q zAIZEw*E&dhv-Q@@8+TiojYLj1GYeKo`iOpty~djF8|!ph=*{!dKrhnB{GH6E+ycwv znU&b?mob*ABj%e>p(*gbr|_`Q@w!Ht3!GPC%1bQZ+q3Ec^{u6a1ym@WcNTRSwe@l( z-m9`}D0N>6y70fL9vvv;G`~DD>kqo9p8`}_g6hvj3P9G6JIL(p>O8NoJd-dhtcBA8 z_N;hJ{R0=ubG+%Tx1}KdenEVZ`T2^9EV|IKoz1}VG@3P4+JEDAis{#d2#dnq9rLFI zHe=~r5s{ICOk;5sO$DSPJ`MgdD4hAFxDua%Xp*Tte_)_!?)UGMEEu)i-LK@BSn4?k zaK`)hJ0S(rbKe%cCS!;dBUFgwM&wH?G6VcD5TS+c@sa;T85J0-tgnEeprG?gQz8nT zmuVLUKdw>S;}I;Tro{_XmvpW0E41sZE>pw(sF7k%U7-BEKgmS{_on+0vAb zkV={N2ZBG}7W;Se<)8mXW0}d83%Pq7UTZp-($|71EHRa|FtdHVb@t12?T=3#v{>#3 zYFBW-w=Xug$4qh;FxxqCaGU0-1nemRWsgq#vwv0+cKQMFY`Q4xCKbabzIk7b=ekE`8FBm$KSh;T5eQUxMTV@@xw>8Ytz z`+j0Ni~e&S#%VP28kz(iA|n3M$2OWAx;Ub_8DSXMB-GSTUZToA3k!>^ht>9pz24c; zqa+I5y>DsdEQNUr^cV#@f?NY2?P$v;=kuv2inv~Wy3%5$v)@Sa+dZWS-fls&`ztr) zO4#u2F_O#E-kh~7{#4)0jlW&Uz~(<7DtIvDF$DE3D}ieW{47jy>+Ai`C!Np7+M1%h zAdZ>Z)YEwNq*70WX`cbD1PLn_gAj2|Uhj8)IEPW18usNeF8>XIke>oHsYERYV(Nnt zzE#yPnL?-c`cV~7Hm>0F=W(&59LBZK{oscdKADI1LInkd*w|QR4i2nta55N5xQxse zv_d2a9|oVWN^#-PL}hz;w|!2_isSdg0*(FDI?V1pmX8f0JLsR>IhK&ZIS1ye%MUv1 zghw*SQVznUa%luoR)U4vRu)FuR;9@UpFMlhHEG*iD9b23I%_BFvewHq%jK(i3wIjH zNg3)>HL0oggpP4Qza-7PT#qu2E4RbS=lj)ZN3U+$Il^>g)OcHITo-#uX_rzd{TGA} zkg(Dw+<1kLg~80syk|ms1hsfU!|#M#8w&d5Q{(zse1FIwwdT;eQ^1Y=aVz%zr%0t_ zmY|!1);pG?1X|J+4>N6IfE2gX3EsxHqSM!$vsZ&90-`6_qKO5~sdf^M=!v*^k@Y%J|tGZ2y98?x+j^* z&Y_bAp1q^nnSG##IhpYvtrS+WX=|_|5_QI=9sk&XuVvodCl4f%zci(sTGqmH(HSMrRq(U;0R&Stq2G|}cD zy}dP&4js3>I)A@|SwVIAUdLeTV+!iLOvmc$M{qs0L-QS`dA669xa{6vwZdw)V^#F> zNy+rim*z3VIW@at4jBGoSgJc16{&X5`~`9+vq}TTp)eHCjvFKa+LUsS-u|+5eb~2z z{Xj)$!-T}AO_T9n+I;Yj@1#n#Xp~l|mlBV^pC$M>f4!$(7y`r{!+H$qgA4_lP zDAm`B`of$Owa+oWNh)YP7yg%~SLb zc0Plk4}zqile<&_&4E@pZDK`Cr?%_7a&O%4xMWd=T7`3u|6jm3Lwnga@};yi62*j| zz_}nuqs5r7$#qTb&SxP--1}K@cj4V#z^hlUYCrG?KbLJ@Nd!ZOZR=51eR~IC!No%k z5s0n7M(*1$FE4+ssOWcWyp9V!dbMRIJJbh?&DTiT|6(5aiHk(l`lzwa4mv9rd6}Xo zx>q24WipF??)&~+TXm1F_!!D?NBD8OY}i1$+3F?c29+SL%BMZ_)!(ILjC__*3o+aM z8=4!(XVt#_Bh{Hvh9F<0g56^a@Q*|)O;1J9DW; zzd>`hWqM7ud*y$69%PfGHd*7ukNgoa7ee8PI7vx7yp&=c0AyLuGu7)Ae$N z9}ibzwZ&NkK z*Z-u_G)`^L`x_4w73h@e);jv}u>&L;vvqPOK3iY5L&x+mwWl z1%BEt-OO@Bqo*G+7}xJQobu__UHT~MOoR?pDRY8nrWJ0jaicoO-&^kd;VPF zgPv;a1zA{==qVh1-!0yH@sl`BZZ1{Mr_g>^PEJnj`lX2mhglTSw>B#1^~8;Ls>kAy zy*YR+@*cAMC5eF0&HN91*3iREZ78L@;>{0N=k;n5kWk}X-4^){h=wd`22H{(hFe<~=bf37hZtbm6?8l+?~Jh-^5 zG>|IE5xG;Mn98aK@N?nIdL!-&F!)#%>>Jy6EH24SNcOMMAFu~V;|qf{G+xst_nL3a zebc3r(3?Q84${fJPZkScGSH03`x@Hx57n!ZtiSo)Yxc-No8`z?O1Dhopuv+H1xg_0 z#@Z;~C@n3elhx}B+uEL^di7+0-9plb#A=NTO{-Or=AZlE)wf4o14Eq4+gu0f#6$J@riT$ZqgubiDEAV zvgrE_^uw8n_G{g&yzsD-$#l)kk{2L@kgKh;JQL{FnSYm&ZY3(VJtz&^pUSU%X3k#q 
zq2eAFXllZyEn;R~4Vk9PMBF(jm*AJTl_ljYOQMz-+|8Ig9FDy7j?+&9?NE1{Pn~WU zjf1DR)`H|8;qS@IuKhW@k=85Y1>~-^H~AnI+fpY!4?CHgofOpfiv`kvVBRG2=qIb- zdUI7U?!e*r^YG{dRq9pgne|PFn&J3z{dbX#^#h5v&aY~DGdh|B`oYK#ps6&>>o1VM zI%r7o|M09bz6r2(WTxL%~T3dUkFr5r>tlN~l&xp-Ll9AT0Rd&g2*5qUwfL7$huPm<993 z=aO#!VF1eSjrSXBlG}9pTwBjYk~Spl`}6Vcsqm62gB^ z&-U){t=`bOMAnMzjmxLT{m;^Y8WeS#d}4pvrm*=U87c4z3gz7&o6SVQlGj+V`Y zc}I?lGUa2%KW}#`utQ~UxpAu}H&T=c1$nC}r`fswb{(VVhknk-)%^;Zm$X1J`AFb_4MWJa+^t-N>2@+i>>M02xm}?UdGjEv z!7&R6h(@s^JQlmx2gF7>f&n?*xxvJr&j(aJ&t>JfaWGdl2BQGD{fji*Lle+RzP zX`+pme$(oIwE(&PxB1~M=&;RvpdN8$O1=jvNuAH8L8g51Yi-=F$yVXQJ|_83E?d%i z9WRnE_xb2HUXhfg0aO--0!j_HI0Wqijs}mt5i9lB*|CMUWa@j#hu)pdcuVcF@p)`0R~l+FOD>cC2SiFq<&J%S{-qdBEc{sxE1cPWC2EMn5dv{= zbS*meW(SF^_JsyNoCb6PZ+>yab#qBM6&ACXIwSn?TwGih;aB}M^H((p0xXPhdW@L3 zA7`3|)DjXBFGQ})m(nCMbD_DKj-0(}n{6V_vN{TzNm}z8aSTd#r+Af=*HtkFLx~4$ zKfkUkOv&s2)NDe54ukfD+q5?Syr)Mc(`G~1ZipPieyJyjfSl2(Ie4KpVubIVD994j zbn15xSUT=q076tbxDN>V>>hr!CbUPO%~CjFyM1zC()68yg2v~x;FmgYjj4faBL9Mr zl=06h{Q_yD5tFv8FW4gF!?8B0ekRL5v^VkT$Bd4QSi~As>7kq_CMjtMs1=#LWX&Qe znS(;wEUZ-t3K~#n1ADQj4!?Qbm#AfffJbCy&WOI93 zE!?#UUz@dYJT@U?C$?DfiTiV|#kUbFlt69uK5GWkThYSG2N6OrNEt|ZuA<)e%(WdR zXu5VPStBdns;(<>{Vb4%u*P6iSeR7Gg+$JQkH0!FpAxFd>Uv|J8u&*76kCWOauVy9 zicenEx_eyC_kvXJz?Y)yRFbheIDIAcV{~AIG_E!c*b8cjuCQ89L%Zg zSgS}-x{MB{v_=^H1V*IHT}sG8#? z6H&wWqm?~c8X9rmBdT_!-oURPYQXXZ3_Ol8n%MT9Pzl8ql!r5($#pe59IaFJdj-S80p)&75X;2)pN`FmIpWVvg zfHSIKiE7s6q0SmrUi0}+plVJUTH4#TuQA0o0TZbmQ{ufo zsu1K9TJ8Hpg&mG{X&1Vtb+pk7y!^%S>XpLH!30SERPBGWov2@4`yGf_EPi#;K{1$QOa)!TKZR*ErJl}7q1_>&7U_L@e6|&03b?OgJIg0=KCvgg66a!ZvE&;)@ zj#PMW=%{s{wU9}s2hhmHw{F7r!iFmG@AY|$S!vVFZMt6$ShQ=R*nz+RoPRSP3(L%7 z8Iq{M!9hXuo>15P^HVheD*VM`Uo1j~)meqVK1!Uzx1S=?kA^BZ+1c6W=H`B`u4)hx z6sw~)2z49k5fV~Teg{XTzn^J6M}a?#)BjbvqNEG+x(fS0e*S#Z*Vo6la%b~DrtuK) z;(utOh+FdltSA46&MJjl{{KOl)c*xat^Wg4z&$uXk9pIKai@83@HS>d0*F96*`-Lp z!XT&$eiViWwGyC;sf)A_qTxfLl^QkB$!Hh>xS|kGS%yolF5T}E&d)tjdL)#R3;Q3A z%X)-pYkND;-}=8MgY+khQk{Ns^I0;!XB0Ukplo4Kvjb~JE_`K~2~l*_=GI0Allvql z`X|~x;?RZvqd=)4P{^gUf;1Bn4(SoTeZW5LMs7=d>7SI5@^AfICL{Ijp(g7|p8L$3 zD-oMX+p;~zqXUbF&y5WYr&ChUTF2kdQQrAlLybk2?rT! 
[... binary image data elided ...]
zAJ&KfO>JDDS9Ju*f+fm#Cn4s$pNmFhg_l!Pd0EK}Y-)Uq^u5vs+xDGzF*9~pKm)t(+!ziTLyoJ2GYX>21AAoJdkZ*6!3VRYbg5b_SfSPLO#jEyB@ zxj&IymhA{FFX3Cf$xJf|&G>0sRwxfgQP{OF-M)vmwz)6%Ug9L8g=Iwf4(Dp;0Z8uq zuvJP_BPFGn`^OAgKeLbvy7+m&&+)88iequk=#p4|E^;NBaz)W~a_H(r827Yxk5EnznLeKb4A zS8nr0S^O986(>A-?6Q2$p|!&E`9$lM9zwocI{CEYe8GJ^uI1D+hVcfycqFciyQR}M zn;<7~cTb*Mgt7xp>HTl}0(ui5-156j&21*c6#rimbH`iK9N7M}<~#CJbZR27^fzvwB3|UN zbMnUx@q2F02$$;4jJ!Uh&HVS0>UNdXqb6%IeJcOY3Xy-B&(FZ{z(E-3Oolhwv3@77 zuU~ySmjBMI&+l(<;)sa^wy4WX@^)vs{K}lj!N5>(fb%c|gB_dJt?j29R(*>M+5fRN z-}*(pOff6x5> z)dgpMPgmFDdw#u=|90>Q$TB;-xZ1D$D_6Xj;mm*cKk!04Js$5#uh;umeHH3pWRUOs z^y%7rrR(wg&zDtxxfr{t@bT@)-P__?C4gEFtFm?`sJ0E9U&nhO9C?H1Fvs> zJAEU_DgWkx%scw%7|{D5?*JWDaiD3#lfztxC#Brnl=AFM%E?Jj`s;oEeOb<*D#hTy zY5iislMja{H!??G=eW(jR`CAmHs1WFz{?4e?^Pr_o8{eE5yS&DSiy65*;`O#l)S0P z;Isepf%%eD;p~t0TMR(5K=5rd^X@c|mw-UV0hlhpz-t;*zG0Fo4=An~7#5^}5*P?* zM2;#02E%B8fK$V03IPWL!)TTPhXYdKaP2?)``Sj`U$ZA9fTTQK{an^LB{Ts5qo1!w literal 0 HcmV?d00001 diff --git a/example/.gitignore b/example/.gitignore new file mode 100644 index 0000000..ee7c9b0 --- /dev/null +++ b/example/.gitignore @@ -0,0 +1,14 @@ +# possible output directories +**/log/ +**/output/ +**/output* + +# output logs +*.log +# output protobuf binary data files +*.pb.bin +# output figures +*.png +*.jpg +*.jpeg +*.gif diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 0000000..04b5bba --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1 @@ +# add_subdirectory(monitoring_sys_lib) diff --git a/example/monitoring_sys_lib/common.py b/example/monitoring_sys_lib/common.py new file mode 100644 index 0000000..6d9021a --- /dev/null +++ b/example/monitoring_sys_lib/common.py @@ -0,0 +1,21 @@ +import os +import sys + +script_dir = os.path.dirname(os.path.abspath(__file__)) +tests_dir = os.path.abspath(os.path.join(script_dir, os.pardir)) +root_dir = os.path.abspath(os.path.join(tests_dir, os.pardir)) +python_src_dir = os.path.join(root_dir, "src") +config_dir = os.path.join(root_dir, "config") +sys.path.append(python_src_dir) + +# === before monitoring system import === +# Keep this to enable absl to dump meaningful help message when invoked with +# --[no]help, --[no]helpfull, --[no]helpshort, and --[no]helpxml +# TODO: figure out why adding this line will make absl flags behave normal +from absl import flags as abflags +import utils.python_utils as pyutils + +# auto log_dir if not specified +if not any([p in arg for p in ["--log_dir", "--create_log_dir"] for arg in sys.argv]): + sys.argv.append(f"--log_dir={os.path.join(pyutils.get_script_dir(__file__), 'output')}") + sys.argv.append(f"--create_log_dir=True") diff --git a/example/monitoring_sys_lib/test_parser.py b/example/monitoring_sys_lib/test_parser.py new file mode 100644 index 0000000..e1a6d09 --- /dev/null +++ b/example/monitoring_sys_lib/test_parser.py @@ -0,0 +1,242 @@ +from common import * + +import utils.colored_print as cprint +from utils.logger import logging, Logger +from datetime import datetime, timezone + +import os +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--target_folder", + default=os.path.join(script_dir, "output"), + type=str, + help="Output folder to be analyzed. 
+
+# only parse known args because there could be other arg parsers in the system
+args = parser.parse_known_args()[0]
+
+search_dir = os.path.abspath(args.target_folder)
+cprint.iprintf(f"Analyzing folder {search_dir}")
+
+# pick the most recent run among the timestamp-named output folders
+outputs = {
+    dir_name: epoch
+    for dir_name, epoch in (
+        (d, datetime.strptime(d, Logger().dir_time_format).replace(tzinfo=timezone.utc).timestamp())
+        for d in os.listdir(search_dir)
+        if os.path.isdir(os.path.join(search_dir, d))
+    )
+}
+sorted_outputs = sorted(outputs.items(), key=lambda x: x[1], reverse=True)
+assert len(sorted_outputs) > 0, "No output directories found in the specified path."
+
+output_folder_name, time_since_epoch = sorted_outputs[0]
+output_folder = os.path.join(search_dir, output_folder_name)
+cprint.iprintf(f"Preprocessing output folder: {output_folder}")
+
+data_filenames = [
+    filename for filename in os.listdir(output_folder) if filename.endswith(".pb.bin")
+]
+cprint.iprintf(f"Available data files: {data_filenames}")
+
+import proto.cpu_metrics_pb2 as cpu_metrics_pb2
+import proto.gpu_metrics_pb2 as gpu_metrics_pb2
+import proto.disk_metrics_pb2 as disk_metrics_pb2
+import proto.proc_metrics_pb2 as proc_metrics_pb2
+import google.protobuf.message
+from typing import Type, TypeVar, BinaryIO
+
+
+def read_next_buf(msg: google.protobuf.message.Message, f: BinaryIO) -> bool:
+    """
+    Read the next message from the file. Assumes the file is in the format where
+    each message is preceded by its length as an 8-byte little-endian integer.
+
+    Args:
+        msg: The protobuf message to fill.
+        f: The file object to read from.
+
+    Returns:
+        True if a message was read, False on a clean end of file.
+    """
+    msg_len_bytes = 8
+    read_buf = f.read(msg_len_bytes)
+    if len(read_buf) == 0:
+        return False
+    if len(read_buf) < msg_len_bytes:
+        raise EOFError("Reached end of file before reading a full message length.")
+    msg_len = int.from_bytes(read_buf, byteorder="little")
+    act_len = msg.ParseFromString(f.read(msg_len))
+    assert act_len == msg_len, f"Expected to read {msg_len} bytes, but got {act_len} bytes."
+ return True + + +T = TypeVar("T", bound=google.protobuf.message.Message) + + +def extract_time_series(data_file: str, message_type: Type[T]) -> T: + whole_msg = message_type() + with open(data_file, "rb") as f: + msg = message_type() + while read_next_buf(msg, f): + whole_msg.MergeFrom(msg) + return whole_msg + + +import utils.colored_print as cprint +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.ticker as mtick + +window_size = 5 +smooth_filter = np.ones(window_size, dtype=np.float64) / window_size +smoothing = True + +for data_filename in data_filenames: + if data_filename.startswith("CPUMeter"): + cprint.iprintf("Generating figures for CPU metrics...") + data_file = os.path.join(output_folder, data_filename) + msg = extract_time_series(data_file, cpu_metrics_pb2.CPUMetricsTimeSeries) + + nfields = len(cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields) + ntslices = len(msg.metrics) + + timestamps = np.empty(ntslices, dtype=np.int64) + result = np.empty((nfields, ntslices), dtype=np.int64) + + for metric_idx, metric in enumerate(msg.metrics): + timestamps[metric_idx] = metric.timestamp + for field_idx, field in enumerate(cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields): + result[field_idx, metric_idx] = getattr(metric.core_stats[0], field.name) + result_processed = np.array([result[:, i] - result[:, i - 1] for i in range(1, ntslices)]).T + + # normalize ntslices + result = result_processed / np.sum(result_processed, axis=0) * 100 + if smoothing: + # apply smoothing filter + result = np.apply_along_axis( + lambda x: np.convolve(x, smooth_filter, mode='same'), axis=1, arr=result + ) + + # remove the first timestamp as it is the base + timestamps = timestamps[ + 1: + ] # remove the first timestamp as it is not used in result_processed + timestamps = ( + timestamps - timestamps[0] + ) / 1e9 # normalize timestamps to start from 0 and convert to seconds + labels = [field.name for field in cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields] + + cprint.iprintf(f"Last timestamp: {timestamps[-1]}") + + fig = plt.figure(figsize=(12, 6)) + ax = plt.subplot(111) + # print(result.shape, result_processed.shape, timestamps.shape) + ax.stackplot(timestamps, result, labels=labels) + fig.legend(loc='upper center', ncol=len(labels), bbox_to_anchor=(0.5, 0.95)) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Percentage of CPU Usage (%)") + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 15) + ax.yaxis.set_major_formatter(mtick.PercentFormatter()) + fig.savefig(f"{data_filename}.png", dpi=300, bbox_inches='tight') + cprint.iprintf(f"CPU metrics figure saved to {data_filename}.png") + + if data_filename.startswith("GPUMeter"): + cprint.iprintf("Generating figures for GPU metrics...") + colors = ["#57B4E9", "#019E73", "#E69F00", "#0072B2", "#B21000", "#5B0680"] + linestyles = ['-', '--', '-.', ':', "-"] + data_file = os.path.join(output_folder, data_filename) + msg = extract_time_series(data_file, gpu_metrics_pb2.GPUMetricsTimeSeries) + ts = np.array( + [1757098153657814454, 1757098155015994472, 1757098167621610434, 1757098249092188605] + ) + + nfields = len(msg.metrics[0].per_gpu_metrics[0].GPM_metrics_values) + ntslices = len(msg.metrics) + + timestamps = np.empty(ntslices, dtype=np.int64) + result = np.empty((nfields, ntslices), dtype=np.float64) + for metric_idx, metric in enumerate(msg.metrics): + timestamps[metric_idx] = metric.timestamp + for field_idx in range(nfields): + result[field_idx, metric_idx] = metric.per_gpu_metrics[ + gpu_id := 0 + ].GPM_metrics_values[field_idx] + # normalize 
ntslices + # result = result / np.sum(result, axis=0) * 100 + if smoothing: + # apply smoothing filter + result = np.apply_along_axis( + lambda x: np.convolve(x, smooth_filter, mode='same'), axis=1, arr=result + ) + ts_rectified = (ts - ts[0]) / 1e9 + timestamps = (timestamps - timestamps[0]) / 1e9 + labels = [field.name for field in gpu_metrics_pb2.GPUMetrics.DESCRIPTOR.fields] + fig = plt.figure(figsize=(12, 8)) + + cprint.iprintf(f"Last timestamp: {timestamps[-1]}") + + ax = plt.subplot(411) + ax.plot(timestamps, result[0], label="SM Utilization", color=colors[0], linewidth=1) + ax.plot(timestamps, result[1], label="SM Occupancy", color=colors[1], linewidth=1) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 20) + ax.axhline(y=100, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + for t in ts_rectified: + ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax = plt.subplot(412) + ax.plot(timestamps, result[2], label="PCIe TX", color=colors[0], linewidth=1) + ax.plot(timestamps, result[3], label="PCIe RX", color=colors[1], linewidth=1) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Throughput (MB/s)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 150) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + for t in ts_rectified: + ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax = plt.subplot(413) + ax.plot( + timestamps, + result[4], + label="DRAM BW Utilization", + color=colors[0], + linestyle=linestyles[0], + ) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 105) + ax.legend(loc='upper right', ncols=10) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + for t in ts_rectified: + ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax = plt.subplot(414) + ax.plot(timestamps, result[5], label="Integer Utilization", color=colors[0], linewidth=1) + ax.plot(timestamps, result[6], label="FP16 Utilization", color=colors[1], linewidth=1) + ax.plot(timestamps, result[7], label="FP32 Utilization", color=colors[2], linewidth=1) + ax.plot(timestamps, result[8], label="FP64 Utilization", color=colors[3], linewidth=1) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 5) + ax.axhline(y=100, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + for t in ts_rectified: + ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + fig.savefig(f"{data_filename}.png", dpi=300, bbox_inches='tight') + cprint.iprintf(f"GPU metrics figure saved to {data_filename}.png") diff --git a/example/monitoring_sys_lib/test_parser_new.py b/example/monitoring_sys_lib/test_parser_new.py new file mode 100644 index 0000000..c676163 --- /dev/null +++ b/example/monitoring_sys_lib/test_parser_new.py @@ -0,0 +1,708 @@ +from common import * + +from datetime import datetime, timezone + +import os +import itertools + +include_path = os.path.join(os.path.dirname(__file__), "..", "..", "src", "proto") 
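+# the generated *_pb2 protobuf modules are expected to live under src/proto; the
+# path is normalized and appended to sys.path below so they can be imported directly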
+include_path = os.path.abspath(include_path)
+print(f"Using protobuf modules from: {include_path}")
+sys.path.append(include_path)
+
+import cpu_metrics_pb2 as cpu_metrics_pb2
+import gpu_metrics_pb2 as gpu_metrics_pb2
+import disk_metrics_pb2 as disk_metrics_pb2
+import proc_metrics_pb2 as proc_metrics_pb2
+import mem_metrics_pb2 as mem_metrics_pb2
+import google.protobuf.message
+from typing import Type, TypeVar, BinaryIO
+
+import utils.colored_print as cprint
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+
+# analyze the output folders produced by the C++ side under src/output
+search_dir = os.path.abspath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir, "src", "output")
+)
+print(f"Searching output folders in: {search_dir}")
+outputs = {
+    dir_name: epoch
+    for dir_name, epoch in (
+        # (d, datetime.strptime(d, "%Y-%m-%dT%H:%M:%S%z").replace(tzinfo=timezone.utc).timestamp())
+        # for d in os.listdir(search_dir)
+        # if os.path.isdir(os.path.join(search_dir, d))
+        # NOTE: timestamp parsing is disabled for now, so every folder gets epoch 0
+        # and the sort below effectively keeps the directory listing order
+        (d, 0)
+        for d in os.listdir(search_dir)
+    )
+}
+sorted_outputs = sorted(outputs.items(), key=lambda x: x[1], reverse=True)
+assert len(sorted_outputs) > 0, "No output directories found in the specified path."
+
+
+def draw_lines(ax, x_pos, min_spacing=0.5):
+    """
+    Draw red dashed vertical lines with non-overlapping text labels.
+    If two lines are close (within min_spacing seconds), their text is shown
+    at different y-levels to avoid overlap.
+ """ + # --- Parse lines safely --- + parsed = [] + for s in x_pos: + try: + idx = s.index(",") + tag = s[:idx].strip() + t = int(s[idx + 1 :].strip()) / 1e9 + parsed.append((tag, t)) + except Exception: + continue + if not parsed: + return + print(f"Drawing {parsed} event lines.") + start_time = parsed[0][1] + y_min, y_max = ax.get_ylim() + y_range = y_max - y_min + + # Vertical positions to cycle through for close events + y_levels = [ + y_min + 0.02 * y_range, + y_min + 0.08 * y_range, + y_min + 0.14 * y_range, + y_min + 0.20 * y_range, + ] + + last_x = -float("inf") + level_idx = 0 + + for tag, t in parsed: + xpos = t - start_time + + # --- Draw vertical line --- + ax.axvline(x=xpos, color='red', linestyle='--', linewidth=1.0, alpha=0.8) + + # --- Adjust text vertical level if lines are close --- + if xpos - last_x < min_spacing: + level_idx = (level_idx + 1) % len(y_levels) + else: + level_idx = 0 + y_text = y_levels[level_idx] + last_x = xpos + # print(y_max) + # --- Draw label --- + ax.text( + xpos + 0.05, # small horizontal offset + 1, + tag, + fontsize=7, + rotation=60, + color='black', + va='bottom', + ha='left', + backgroundcolor='white', + bbox=dict(facecolor='white', edgecolor='none', pad=0.4, alpha=0.7), + clip_on=True, + ) + + # ax.set_ylim(y_min, y_max) + + +def read_next_buf(msg: google.protobuf.message.Message, f: BinaryIO) -> bool: + """ + Read the next message from the file. Assumes the file is in the format where + each message is preceded by its length as an 8-byte little-endian integer. + + Args: + msg: The protobuf message to fill. + f: The file object to read from. + """ + msg_len_bytes = 8 + read_buf = f.read(msg_len_bytes) + if len(read_buf) == 0: + return False + if len(read_buf) < msg_len_bytes: + raise EOFError("Reached end of file before reading a full message length.") + msg_len = int.from_bytes(read_buf, byteorder="little") + act_len = msg.ParseFromString(f.read(msg_len)) + assert act_len == msg_len, f"Expected to read {msg_len} bytes, but got {act_len} bytes." 
+ return True + + +T = TypeVar("T", bound=google.protobuf.message.Message) + + +def extract_time_series(data_file: str, message_type: Type[T]) -> T: + whole_msg = message_type() + with open(data_file, "rb") as f: + msg = message_type() + while read_next_buf(msg, f): + whole_msg.MergeFrom(msg) + return whole_msg + + +window_size = 5 +smooth_filter = np.ones(window_size, dtype=np.float64) / window_size +smoothing = False + +for output_run in sorted_outputs: + + output_folder_name, time_since_epoch = output_run + output_folder = os.path.join(search_dir, output_folder_name) + print(f"Preprocessing output folder: {output_folder}") + + data_file_names = [ + filename for filename in os.listdir(output_folder) if filename.endswith(".pb.bin") + ] + # data_file_names = [filename for filename in os.listdir(output_folder) if filename.endswith(".data")] + print(data_file_names) + timebreak_file = os.path.join(output_folder, "time_break_down.txt") + x_pos = [] + if not os.path.exists(timebreak_file): + cprint.iprintf(f"Time break down file not found: {timebreak_file}") + continue + with open(timebreak_file, 'r') as file: + for line in file: + s = (str)(line) + x_pos.append(s) + + start_time = 0 + + for data_file_name in data_file_names: + if data_file_name.startswith("CPUMeter"): + colors = [ + "#57B4E9", + "#019E73", + "#E69F00", + "#FFFFFF", + "#B11000", + "#5B2680", + "#56B449", + "#329E73", + "#463F80", + ] + cprint.iprintf("Generating figures for CPU metrics...") + data_file = os.path.join(output_folder, data_file_name) + msg = extract_time_series(data_file, cpu_metrics_pb2.CPUMetricsTimeSeries) + + nfields = len(cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields) + ntslices = len(msg.metrics) + + timestamps = np.empty(ntslices, dtype=np.int64) + result = np.empty((nfields, ntslices), dtype=np.int64) + + for metric_idx, metric in enumerate(msg.metrics): + timestamps[metric_idx] = metric.timestamp + for field_idx, field in enumerate(cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields): + result[field_idx, metric_idx] = getattr(metric.core_stats[0], field.name) + result_processed = np.array( + [result[:, i] - result[:, i - 1] for i in range(1, ntslices)] + ).T + + # normalize ntslices + result = result_processed / np.sum(result_processed, axis=0) * 100 + if smoothing: + # apply smoothing filter + result = np.apply_along_axis( + lambda x: np.convolve(x, smooth_filter, mode='same'), axis=1, arr=result + ) + + # remove the first timestamp as it is the base + timestamps = timestamps[ + 1: + ] # remove the first timestamp as it is not used in result_processed + timestamps = ( + timestamps - timestamps[0] + ) / 1e9 # normalize timestamps to start from 0 and convert to seconds + labels = [field.name for field in cpu_metrics_pb2.CoreStat.DESCRIPTOR.fields] + + cprint.iprintf(f"Last timestamp: {timestamps[-1]}") + + fig = plt.figure(figsize=(10, 4)) + ax = plt.subplot(111) + + # vertical lines + draw_lines(ax, x_pos) + + ax.stackplot(timestamps, result, colors=colors, labels=labels) + fig.legend(loc='upper center', ncol=len(labels) // 2, bbox_to_anchor=(0.5, 1.05)) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Percentage of CPU Usage (%)") + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 100) + ax.yaxis.set_major_formatter(mtick.PercentFormatter()) + # fig.savefig(f"{data_file_name}_{output_folder_name}.png", dpi=300, bbox_inches='tight') + save_path = os.path.join(output_folder, f"{data_file_name}.png") + fig.savefig(save_path, dpi=300, bbox_inches='tight') + cprint.iprintf(f"CPU metrics figure saved to 
{data_file_name}.png") + plt.close(fig) + + if data_file_name.startswith("GPUMeter"): + cprint.iprintf("Generating figures for GPU metrics...") + colors = ["#57B4E9", "#019E73", "#E69F00", "#0072B2", "#B21000", "#5B0680"] + linestyles = ['-', '--', '-.', ':', "-"] + data_file = os.path.join(output_folder, data_file_name) + msg = extract_time_series(data_file, gpu_metrics_pb2.GPUMetricsTimeSeries) + ts = np.array( + [1757098153657814454, 1757098155015994472, 1757098167621610434, 1757098249092188605] + ) + # print(msg.metrics[100]) + ntslices = len(msg.metrics) + if ntslices < 1: + cprint.iprintf("No GPU metrics data found, skipping...") + continue + nfields = len(msg.metrics[0].per_gpu_metrics[0].GPM_metrics_values) + print(msg.metrics[0]) + + timestamps = np.empty(ntslices, dtype=np.int64) + result = np.empty((nfields, ntslices), dtype=np.float64) + mem = np.zeros((2, 128, ntslices), dtype=np.int64) + for metric_idx, metric in enumerate(msg.metrics): + timestamps[metric_idx] = metric.timestamp + for field_idx in range(nfields): + result[field_idx, metric_idx] = metric.per_gpu_metrics[ + gpu_id := 0 + ].GPM_metrics_values[field_idx] + if metric.per_gpu_metrics[0].per_process_gpu_metrics: + for id, proc in enumerate(metric.per_gpu_metrics[0].per_process_gpu_metrics): + if id < mem.shape[1]: + mem[0, id, metric_idx] = proc.used_gpu_memory + # else: + # mem[0, 0, metric_idx] = 0 + if metric.per_gpu_metrics[1].per_process_gpu_metrics: + for id, proc in enumerate(metric.per_gpu_metrics[1].per_process_gpu_metrics): + if id < mem.shape[1]: + mem[1, id + 2, metric_idx] = proc.used_gpu_memory + # normalize ntslices + # result = result / np.sum(result, axis=0) * 100 + # if smoothing: + # # apply smoothing filter + # result = np.apply_along_axis( + # lambda x: np.convolve(x, smooth_filter, mode='same'), axis=1, arr=result + # ) + ts_rectified = (ts - ts[0]) / 1e9 + timestamps = (timestamps - timestamps[0]) / 1e9 + labels = [field.name for field in gpu_metrics_pb2.GPUMetrics.DESCRIPTOR.fields] + fig = plt.figure(figsize=(12, 12)) + + cprint.iprintf(f"Last timestamp: {timestamps[-1]}") + + plt.subplots_adjust(hspace=0.3) + + ax = plt.subplot(511) + + # vertical lines + draw_lines(ax, x_pos) + + ax.locator_params(axis='x', nbins=10) + ax.plot(timestamps, result[0], label="SM Utilization", color=colors[0], linewidth=1) + ax.plot(timestamps, result[1], label="SM Occupancy", color=colors[1], linewidth=1) + # ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 100) + # ax.axhline(y=100, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + ax = plt.subplot(512) + + # vertical lines + draw_lines(ax, x_pos) + + plt.locator_params(axis='x', nbins=10) + ax.plot(timestamps, result[2], label="PCIe to CPU", color=colors[0], linewidth=1) + ax.plot(timestamps, result[3], label="PCIe to GPU", color=colors[1], linewidth=1) + # ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Throughput (MB/s)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + print("max timestamp:", timestamps[-1]) + ax.set_ylim(0, 4000) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', 
linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + ax = plt.subplot(513) + + # vertical lines + draw_lines(ax, x_pos) + + plt.locator_params(axis='x', nbins=10) + ax.plot( + timestamps, + result[4], + label="DRAM BW Utilization", + color=colors[0], + linestyle=linestyles[0], + ) + ax.plot( + timestamps, + result[5], + label="INTEGER_UTIL", + color=colors[1], + linestyle=linestyles[0], + ) + ax.plot( + timestamps, + result[9], + label="ANY_TENSOR_UTIL", + color=colors[2], + linestyle=linestyles[0], + ) + ax.plot( + timestamps, + result[10], + label="GRAPHICS_UTIL", + color=colors[3], + linestyle=linestyles[0], + ) + # ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 105) + ax.legend(loc='upper right', ncols=10) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + """ + ax = plt.subplot(514) + + # vertical lines + draw_lines(ax, x_pos) + + ax.plot( + timestamps, result[5], label="Integer Utilization", color=colors[0], linewidth=1 + ) + ax.plot(timestamps, result[6], label="FP16 Utilization", color=colors[1], linewidth=1) + ax.plot(timestamps, result[7], label="FP32 Utilization", color=colors[2], linewidth=1) + ax.plot(timestamps, result[8], label="FP64 Utilization", color=colors[3], linewidth=1) + # ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Utilization (%)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 100) + # ax.axhline(y=100, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + # fig.savefig(f"{data_file_name}_{output_folder_name}.png", dpi=300, bbox_inches='tight') + save_path = os.path.join(output_folder, f"{data_file_name}.png") + fig.savefig(save_path, dpi=300, bbox_inches='tight') + cprint.iprintf(f"GPU metrics figure saved to {data_file_name}.png") + """ + + ax = plt.subplot(514) + + # vertical lines + draw_lines(ax, x_pos) + + plt.locator_params(axis='x', nbins=10) + sum = np.zeros(ntslices, dtype=np.int64) + for iter in range(mem.shape[1]): + # skip if all zero + if np.all(mem[0, iter, :] == 0): + continue + sum = sum + mem[0, iter, :] / (1024 * 1024 * 1024) + ax.plot( + timestamps, + sum, + # label=f"Process {iter} Mem Usage", + linewidth=1, + ) + + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Memory (GB)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 100) + ax.axhline(y=94, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + ax = plt.subplot(515) + + # vertical lines + draw_lines(ax, x_pos) + + plt.locator_params(axis='x', nbins=10) + sum = np.zeros(ntslices, dtype=np.int64) + for iter in range(mem.shape[1]): + # skip if all zero + if np.all(mem[1, iter, :] == 0): + continue + sum = 
sum + mem[1, iter, :] / (1024 * 1024 * 1024) + ax.plot( + timestamps, + sum, + # label=f"Process {iter} Mem Usage", + linewidth=1, + ) + # ax.plot( + # timestamps, + # mem[0] / (1024 * 1024 * 1024), + # label="VectorDB", + # color=colors[0], + # linewidth=1, + # ) + # ax.plot( + # timestamps, + # (mem[1] + mem[0] + mem[2]) / (1024 * 1024 * 1024), + # label="Generation Model", + # color=colors[1], + # linewidth=1, + # ) + # ax.plot( + # timestamps, + # (mem[2] + mem[0]) / (1024 * 1024 * 1024), + # label="Rerank Model", + # color=colors[2], + # linewidth=1, + # ) + ax.set_xlabel("Timestamp (s)") + ax.set_ylabel("Memory (GB)") + ax.legend(loc='upper right', ncols=10) + ax.set_xlim(0, timestamps[-1]) + ax.set_ylim(0, 100) + ax.axhline(y=94, color='black', linestyle='--', linewidth=1) + ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7) + # for t in ts_rectified: + # ax.axvline(x=t, color='black', linestyle='--', linewidth=1) + ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5)) + + ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6)) + + # fig.savefig(f"{data_file_name}_{output_folder_name}.png", dpi=300, bbox_inches='tight') + save_path = os.path.join(output_folder, f"{data_file_name}.png") + fig.savefig(save_path, dpi=300, bbox_inches='tight') + cprint.iprintf(f"GPU metrics figure saved to {data_file_name}.png") + plt.close(fig) + + # if data_file_name.startswith("DiskMeter"): + + if data_file_name.startswith("DiskMeter"): + cprint.iprintf("Generating figures for Disk metrics...") + colors = ["#57B4E9", "#019E73", "#E69F00", "#0072B2", "#B21000", "#5B0680"] + linestyles = ['-', '--', '-.', ':', "-"] + data_file = os.path.join(output_folder, data_file_name) + msg = extract_time_series(data_file, disk_metrics_pb2.DiskMetricsTimeSeries) + ts = np.array( + [1757098153657814454, 1757098155015994472, 1757098167621610434, 1757098249092188605] + ) + # print(msg.metrics[100]) + # print(msg.metrics[101]) + # print(msg.metrics[1010]) + nfields = 2 # read/write + ntslices = len(msg.metrics) + # print(nfields,ntslices) + # labels = [field.name for field in disk_metrics_pb2.DiskMetrics.disk_metrics.DESCRIPTOR.fields] + # print(labels) + print("======================================") + # print(msg.metrics[0].disk_metrics) + # print(type(msg.metrics[0].disk_metrics[0])) + # print(msg.metrics[0].disk_metrics[0].reads_completed) + # exit(0) + if ntslices < 1: + cprint.iprintf("No disk metrics data found, skipping...") + continue + dis_count = len(msg.metrics[0].disk_metrics) + timestamps = np.empty(ntslices, dtype=np.int64) + temp = np.empty((dis_count, nfields, ntslices), dtype=np.float64) + result = np.empty((dis_count, nfields, ntslices), dtype=np.float64) + # start_r = metric.disk_metrics[0].sectors_read + # start_w = metric.disk_metrics[0].sectors_written + for metric_idx, metric in enumerate(msg.metrics): + timestamps[metric_idx] = metric.timestamp + for disk_id in range(dis_count): + temp[disk_id, 0, metric_idx] = metric.disk_metrics[disk_id].sectors_read + temp[disk_id, 1, metric_idx] = metric.disk_metrics[disk_id].sectors_written + # print(temp[:, metric_idx]) + if metric_idx == 0: + result[disk_id, 0, metric_idx] = 0 + result[disk_id, 1, metric_idx] = 0 + else: + result[disk_id, 0, metric_idx] = ( + (temp[disk_id, 0, metric_idx] - temp[disk_id, 0, metric_idx - 1]) + / (timestamps[metric_idx] - timestamps[metric_idx - 1]) + * 1e9 + * 0.5 + / 1024 + ) + result[disk_id, 1, metric_idx] = ( + (temp[disk_id, 1, metric_idx] - temp[disk_id, 1, metric_idx - 1]) + / (timestamps[metric_idx] - 
timestamps[metric_idx - 1])
+                            * 1e9
+                            * 0.5
+                            / 1024
+                        )
+            # sector counts are in 512 B units, so sectors/s * 0.5 / 1024 yields MiB/s
+            ts_rectified = (ts - ts[0]) / 1e9
+            timestamps = (timestamps - timestamps[0]) / 1e9
+            fig = plt.figure(figsize=(12, 8))
+
+            cprint.iprintf(f"Start time: {timestamps[0]}, last timestamp: {timestamps[-1]}")
+
+            ax = fig.add_subplot(111)
+
+            # vertical lines
+            draw_lines(ax, x_pos)
+            for id in range(dis_count):
+                ax.plot(timestamps, result[id, 0], label="Read", color=colors[0], linewidth=1)
+                ax.plot(timestamps, result[id, 1], label="Write", color=colors[1], linewidth=1)
+            ax.set_xlabel("Timestamp (s)")
+            ax.set_ylabel("Disk IO Bandwidth (MB/s)")
+            ax.legend(loc='upper right', ncols=10)
+            ax.set_xlim(0, timestamps[-1])
+            ax.set_ylim(0, 6000)
+            ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7)
+            ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5))
+            ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6))
+            save_path = os.path.join(output_folder, f"{data_file_name}.png")
+            fig.savefig(save_path, dpi=300, bbox_inches='tight')
+            cprint.iprintf(f"Disk metrics figure saved to {save_path}")
+            plt.close(fig)
+
+        if data_file_name.startswith("MemMeter"):
+            cprint.iprintf("Generating figures for memory metrics...")
+            colors = ["#57B4E9", "#019E73", "#E69F00", "#0072B2", "#B21000", "#5B0680"]
+            linestyles = ['-', '--', '-.', ':', "-"]
+            data_file = os.path.join(output_folder, data_file_name)
+            msg = extract_time_series(data_file, mem_metrics_pb2.MemMetricsTimeSeries)
+            ts = np.array(
+                [1757098153657814454, 1757098155015994472, 1757098167621610434, 1757098249092188605]
+            )
+            nfields = 4
+            ntslices = len(msg.metrics)
+            if ntslices == 0:
+                continue
+            # fractional seconds below, so use a float array; assumes a 100 ms (10 Hz) tick period
+            timestamps = np.empty(ntslices, dtype=np.float64)
+            start_time = msg.metrics[0].timestamp
+            result = np.empty((nfields, ntslices), dtype=np.float64)
+            for metric_idx, metric in enumerate(msg.metrics):
+                timestamps[metric_idx] = metric_idx / 10
+                result[0, metric_idx] = metric.meminfo_metrics.basic_metrics.mem_free / (
+                    1024 * 1024
+                )
+                result[1, metric_idx] = metric.meminfo_metrics.basic_metrics.mem_available / (
+                    1024 * 1024
+                )
+                result[2, metric_idx] = metric.meminfo_metrics.kernel_cache_metrics.buffers / (
+                    1024 * 1024
+                )
+                result[3, metric_idx] = metric.meminfo_metrics.kernel_cache_metrics.cached / (
+                    1024 * 1024
+                )
+
+            ts_rectified = (ts - ts[0]) / 1e9
+
+            fig = plt.figure(figsize=(12, 3))
+            cprint.iprintf(f"Last timestamp: {timestamps[-1]}")
+            ax = fig.add_subplot(111)
+
+            # vertical lines
+            draw_lines(ax, x_pos)
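+            # plot the four meminfo series; /proc/meminfo reports kB, so the divisions
+            # by 1024^2 above give GB (assuming the proto keeps the raw kB units)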
+
+            ax.plot(timestamps, result[0], label="Free", color=colors[0], linewidth=2)
+            ax.plot(timestamps, result[1], label="Available", color=colors[1], linewidth=2)
+            ax.plot(timestamps, result[2], label="Buffered", color=colors[2], linewidth=2)
+            ax.plot(timestamps, result[3], label="Cached", color=colors[4], linewidth=2)
+            ax.set_xlabel("Timestamp (s)")
+            ax.set_ylabel("Memory (GB)")
+            ax.legend(loc='upper right', ncols=10)
+            ax.set_xlim(0, timestamps[-1])
+            ax.set_ylim(100, 1500)
+            ax.axhline(y=100, color='black', linestyle='--', linewidth=1)
+            ax.yaxis.grid(color='lightgray', linestyle='--', linewidth=0.7)
+            ax.set_xticks(range(0, int(timestamps[-1]) + 1, 5))
+            ax.xaxis.set_major_locator(mtick.MaxNLocator(nbins=6))
+            save_path = os.path.join(output_folder, f"{data_file_name}.png")
+            fig.savefig(save_path, dpi=300, bbox_inches='tight')
+            cprint.iprintf(f"Mem metrics figure saved to {save_path}")
+            plt.close(fig)
+
+        if data_file_name.startswith("ProcMeter"):
+            cprint.iprintf("Generating figures for proc metrics...")
+            colors = ["#57B4E9", "#019E73", "#E69F00", "#0072B2", "#B21000", "#5B0680"]
+            linestyles = ['-', '--', '-.', ':', "-"]
+            data_file = os.path.join(output_folder, data_file_name)
+            msg = extract_time_series(data_file, proc_metrics_pb2.ProcMetricsTimeSeries)
+            # TODO: no plotting for proc metrics yet; dump the first sample for inspection
+            print(msg.metrics[0])
diff --git a/example/monitoring_sys_lib/test_run.py b/example/monitoring_sys_lib/test_run.py
new file mode 100644
index 0000000..158bbd9
--- /dev/null
+++ b/example/monitoring_sys_lib/test_run.py
@@ -0,0 +1,34 @@
+# === path resolution ===
+from common import *
+
+# === monitoring system import ===
+from monitoring_sys import MSys
+
+# === after monitoring system import ===
+from utils.logger import logging, Logger
+
+# === start of normal code ===
+from monitoring_sys.config_parser.msys_config_parser import MSysConfig, StaticEnv, MacroTranslator
+import utils.colored_print as cprint
+import time
+
+# reuse the logger output folder
+output_path = Logger().log_dirpath
+
+
+input_config_file = os.path.join(config_dir, "monitor", "example_config.yaml")
+with open(input_config_file, "r") as fin:
+    translated_config = MacroTranslator(StaticEnv.get_static_env("global")).translate(fin).read()
+with open(os.path.join(output_path, "translated_msys_config.yaml"), "w") as fout:
+    fout.write(translated_config)
+monitor = MSys(MSysConfig.from_yaml_string(translated_config))
+
+cprint.iprintf("Start test run")
+ret = monitor.test_run()
+monitor.report_status(verbose=False, detail=True)
+cprint.iprintf(f"Test run finished w/ ret code {ret}")
+if not ret:
+    exit(1)
+
+with monitor:
+    time.sleep(5)
diff --git a/monitoring_sys/.gitignore b/monitoring_sys/.gitignore
new file mode 100644
index 0000000..04a6e8d
--- /dev/null
+++ b/monitoring_sys/.gitignore
@@ -0,0 +1 @@
+generated/*
\ No newline at end of file
diff --git a/monitoring_sys/CMakeLists.txt b/monitoring_sys/CMakeLists.txt
new file mode 100644
index 0000000..9d2f568
--- /dev/null
+++ b/monitoring_sys/CMakeLists.txt
@@ -0,0 +1,120 @@
+include(msys_defs.cmake)
+
+# assumes variables PYTHON_SRC_DIR and THIRD_PARTY_DIR are already set to valid paths
+assert_valid_path(PYTHON_SRC_DIR)
+assert_valid_path(THIRD_PARTY_DIR)
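+# (assert_valid_path is a project-local helper from the shared cmake utilities; it is
+# expected to abort configuration if the variable is unset or not an existing path)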
+
+# === Module name ===
+set(MSYS_MODULE_NAME libmsys)
+
+# === PYBIND11 related ===
+find_program(PYBIND11_STUBGEN pybind11-stubgen)
+message(STATUS "Found pybind11-stubgen at ${PYBIND11_STUBGEN}")
+set(PYBIND11_MKDOC_MODULE_NAME pybind11_mkdoc)
+set(PYBIND11_MKDOC_MODULE_PATH ${THIRD_PARTY_DIR}/pybind11_mkdoc)
+
+# check libclang compatibility first, before the early return below for a missing pybind11-stubgen
+get_libclang_sharedlib_version(LIBCLANG_VERSION)
+if(NOT LIBCLANG_VERSION STREQUAL "")
+    add_py3_pkg_dependencies(${INTERFACE_GEN_TARGET}
+        PKG_REQUIREMENTS clang==${LIBCLANG_VERSION}
+    )
+    add_py3_pkg_requirements("clang==${LIBCLANG_VERSION}" ENV_SPECIFIC)
+else()
+    message(WARNING "Failed to auto-infer the libclang version and set the requirement for ${MSYS_TARGET}.")
+endif()
+
+if (NOT PYBIND11_STUBGEN)
+    message(WARNING
+        "pybind11-stubgen not found, refusing to generate ${MSYS_MODULE_NAME} targets")
+    return()
+endif()
+
+# === Setup module build ===
+# pybind11_add_module is analogous to a cmake add_library call
+pybind11_add_module(${MSYS_MODULE_NAME}
+    ${MSYS_SOURCES}
+)
+cxx_setup_target(${MSYS_MODULE_NAME}
+    INCLUDES ${MSYS_INCLUDES}
+    DEPENDS ${MSYS_DEPENDS}
+    COPTIONS -flto=auto -Wall -Wextra -Werror # treat every warning as an error
+    TARGET MSYS_TARGET)
+# avoid a weird lto-wrapper warning on serial compilation
+# Note: -Wl,-flto=auto does not seem to work, use direct -flto instead
+target_link_options(${MSYS_TARGET} PUBLIC -flto=auto)
+# specify the module name for the python module through a macro definition
+target_compile_definitions(${MSYS_TARGET} PUBLIC MSYS_MODNAME=${MSYS_MODULE_NAME})
+
+# === Set up the python module ===
+# Conda environments do not provide Development.Module and the corresponding cmake files,
+# so ${Python3_EXTENSION_SUFFIX} is missing. Use the python interpreter to get the suffix instead.
+# REVIEW: Maybe find a way to do this logic through Development.Module
+if (NOT DEFINED Python3_EXTENSION_SUFFIX OR "${Python3_EXTENSION_SUFFIX}" STREQUAL "")
+    message(STATUS "Python3_EXTENSION_SUFFIX is not found, check with interpreter")
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -c
+            "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))"
+        OUTPUT_VARIABLE Python3_EXTENSION_SUFFIX_RET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    message(STATUS "Using Python3_EXTENSION_SUFFIX=${Python3_EXTENSION_SUFFIX_RET}")
+    set(Python3_EXTENSION_SUFFIX ${Python3_EXTENSION_SUFFIX_RET})
+endif()
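+# For reference, EXT_SUFFIX typically looks like ".cpython-311-x86_64-linux-gnu.so" on
+# CPython 3.11 / Linux x86-64 (the exact value depends on the interpreter build)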
+
+# setup the CPython shared lib extension
+set_target_properties(${MSYS_TARGET} PROPERTIES
+    PREFIX ""
+    SUFFIX "${Python3_EXTENSION_SUFFIX}"
+)
+
+# === Generate rich interface ===
+# enable stubgen with extra information gathered from pybind11-mkdoc
+set(INTERFACE_GEN_TARGET msys_gen_interface)
+# target interface file specifications
+set(INTERFACE_HEADER_NAME "pybind11_defs.h")
+set(INTERFACE_HEADER_DIR ${CMAKE_CURRENT_LIST_DIR}/generated/interface)
+set(INTERFACE_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/interface.cc)
+# make interface generation a pre-build target
+set(INTERFACE_PATH ${INTERFACE_HEADER_DIR}/${INTERFACE_HEADER_NAME})
+add_custom_command(
+    OUTPUT ${INTERFACE_PATH}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${INTERFACE_HEADER_DIR}
+    # generate the rich interface; redirect stderr to /dev/null to suppress a confusing
+    # "cannot find include files" fatal error message emitted while it tries to resolve
+    # dependencies that are not actually needed
+    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PYBIND11_MKDOC_MODULE_PATH}
+        ${Python3_EXECUTABLE} -m ${PYBIND11_MKDOC_MODULE_NAME}
+        -o ${INTERFACE_PATH} ${INTERFACE_SOURCES} 2>/dev/null
+    DEPENDS ${INTERFACE_SOURCES}
+)
+add_custom_target(${INTERFACE_GEN_TARGET}
+    # make sure the interface header is generated before building the target
+    DEPENDS ${INTERFACE_PATH}
+    COMMENT "Generate rich interface for ${MSYS_MODULE_NAME}"
+)
+# enable the rich interface by defining a macro so the c++ program knows about it
+target_compile_definitions(${MSYS_TARGET} PUBLIC PYBIND11_RICH_INTERFACE)
+add_dependencies(${MSYS_TARGET} ${INTERFACE_GEN_TARGET})
+
+# === Install the module ===
+# symlink the library into the ${PYTHON_SRC_DIR}/monitoring_sys folder and generate a stub
+# for the lib after the build completes
+set(MODULE_DESTINATION ${PYTHON_SRC_DIR}/monitoring_sys)
+add_custom_target(${MSYS_MODULE_NAME}_pymod
+    # symlink the shared lib to the target directory
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+        $<TARGET_FILE:${MSYS_TARGET}>
+        ${MODULE_DESTINATION}/$<TARGET_FILE_NAME:${MSYS_TARGET}>
+    COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --cyan
+        "Symlinked lib in ${MODULE_DESTINATION}/$<TARGET_FILE_NAME:${MSYS_TARGET}>"
+    # generate stub using pybind11-stubgen
+    COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${MODULE_DESTINATION}
+        ${PYBIND11_STUBGEN} ${MSYS_MODULE_NAME} --output-dir "${MODULE_DESTINATION}"
+    COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --cyan "Stub generated in ${MODULE_DESTINATION}"
+
+    WORKING_DIRECTORY $<TARGET_FILE_DIR:${MSYS_TARGET}>
+    DEPENDS ${MSYS_MODULE_NAME}
+    COMMENT "Build libmsys and place the output product in src/monitoring_sys"
+    VERBATIM
+)
diff --git a/monitoring_sys/README.md b/monitoring_sys/README.md
new file mode 100644
index 0000000..a17747b
--- /dev/null
+++ b/monitoring_sys/README.md
@@ -0,0 +1,179 @@
+# Monitoring System (MSys)
+
+## Installation
+
+### Dependencies
+
+1. A C++20 compatible compiler, e.g., GCC 12.1.0 or later
+([Steps](#c-20-compatible-compiler-installation)).
+2. ProtocolBuffer compiler and runtime library, currently using version 30.2
+([Steps](#protobuf-installation)).
+
+#### Before You Start
+
+Using a virtual environment for Python is good practice. In this doc, all setup examples use
+`conda` as the environment manager.
+
+What should be done in the virtual environment:
+
+- Install python-specific modules
+- Configure & compile the monitoring_sys module
+- Run any python code that uses the monitoring_sys module
+
+What should **NOT** be done in the virtual environment:
+
+- Build required libraries (e.g., protobuf) in the virtual environment for a user-wide or
+system-wide installation!
+
+This is because the host python and the conda python might differ in python version,
+installed libraries, etc.
+
+#### C++ 20 Compatible Compiler Installation
+
+Check whether the system compiler already supports C++20; if so, this step can be skipped.
+
+To install a C++ 20 compatible compiler in the virtual environment, for example `gcc=12.1.0`, run
+
+```bash
+conda install -c conda-forge gcc=12.1.0
+```
+
+#### Protobuf Installation
+
+Install the protobuf compiler and runtime library (modified from
+[PROTOBUF_CMAKE](https://github.com/protocolbuffers/protobuf/blob/main/cmake/README.md)).
+Currently, we are using version `v30.2`.
+
+```bash
+# Clone the protobuf repository
+git clone https://github.com/protocolbuffers/protobuf.git
+cd protobuf
+git submodule update --init --recursive
+git checkout v30.2
+# Make & Install to ~/.local
+mkdir build && cd build
+cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -DBUILD_SHARED_LIBS=ON \
+    -Dprotobuf_BUILD_SHARED_LIBS=ON \
+    -Dprotobuf_BUILD_TESTS=OFF \
+    -DCMAKE_CXX_STANDARD=17 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_INSTALL_PREFIX="$HOME/.local"
+cmake --build . --config Release -j
+make install -j
+```
+
+### Building MSys
+
+If you decide to run the application in a python virtual environment, perform the following steps
+in the virtual environment.
+
+#### Preparation
+
+Execute the following instructions to install all the dependencies for the project.
+
+```bash
+# install pip-compile for python package dependency resolution
+python3 -m pip install pip-tools
+
+# configure MSys and generate a list of all required python packages
+mkdir build && cd build
+cmake ..
+make generate_py3_requirements
+python3 -m pip install -r ../requirements.txt
+```
+
+#### Build the MSys Shared Library and Place the Output Product in `src/monitoring_sys`
+
+Run the following commands in the project's build folder.
+
+```bash
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make libmsys_pymod -j
+```
+
+### Examples of Running MSys with Existing Code
+
+There are examples of how MSys can be properly configured and used with existing code in
+`/example/monitoring_sys_lib`:
+
+1. `test_run.py`
+    - Configures an MSys instance from a yaml file and test-runs it without load.
+    - Output will be saved to `/example/monitoring_sys_lib/output/`
+    - The output folder contains
+        - The translated MSys config (the user needs to export this manually)
+        - `libmsys.log`, the log file written by the libmsys submodule
+        - A possible `python_rt.log` if any error is encountered in the python runtime
+        - Several `.pb.bin` files, the serialized protobuf records of all the collected
+          statistics
+        - **(TBA)** A file containing the metadata for each of the serialized outputs, for
+          later identification and parsing of these serialized outputs
+2. `test_parser.py`
+    - After running the MSys, attempts to create plots from the data saved during the
+      recording phase.
+    - It will parse all the `.pb.bin` files
+    > **TODO:** Currently only CPUMetrics and GPUMetrics have parsers, and their field names are
+    > hard-coded; enhance the functionality with the metadata file in the future
+
+The organization format for each protobuf output file is specified as follows (a minimal reader
+sketch is shown after the list):
+
+- A file might contain multiple serialized protobuf messages
+- Each message has the following format (no padding between fields)
+    - A 64-bit little-endian integer specifying the length of the current message (8 B)
+    - A protobuf message serialized in the corresponding `*MetricsTimeSeries` format (length
+      specified by the previous field)
+- The message splitting policy is based on the serialized protobuf size, so one message may
+  contain statistics for multiple time points; refer to `resource/proto/*_metrics.proto`
+  for more information
+- A different parse format may be required for different monitor types
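+
+For illustration, reading the framing back might look like the following sketch (shown for
+`CPUMetricsTimeSeries`; the message class is a placeholder for whichever generated
+`*_metrics_pb2` type matches the file, and `example/monitoring_sys_lib/test_parser.py`
+contains the full version):
+
+```python
+import cpu_metrics_pb2  # generated from resource/proto/cpu_metrics.proto
+
+
+def read_messages(path):
+    """Yield every length-prefixed CPUMetricsTimeSeries message in a .pb.bin file."""
+    with open(path, "rb") as f:
+        while True:
+            header = f.read(8)  # 8-byte little-endian length prefix
+            if not header:
+                break  # clean end of file
+            msg_len = int.from_bytes(header, byteorder="little")
+            msg = cpu_metrics_pb2.CPUMetricsTimeSeries()
+            msg.ParseFromString(f.read(msg_len))
+            yield msg
+```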
+
+### Potential Problems
+
+#### GCC Version Is Too Low in the Conda Environment
+
+##### Corresponding Error
+
+- ImportError: /lib/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by
+  /lib/libabsl_synchronization.so.2501.0.0)
+
+##### Solution
+
+The C++ compiler in use is below the minimum requirement; update the C++ compiler
+([Steps](#c-20-compatible-compiler-installation)).
+
+### Usage Examples
+
+#### Running
+
+`python3 example/monitoring_sys_lib/test_run.py`
+
+#### Stat Parsing
+
+`python3 example/monitoring_sys_lib/test_parser.py`
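+
+#### Minimal Python Usage Sketch
+
+The sketch below strings the bound functions together end to end. The function names mirror the
+pybind11 bindings registered in `monitoring_sys/src/interface.cc` (later in this patch); the
+import name is set by `MSYS_MODNAME` at build time, so `libmsys` and the directory paths here
+are assumptions, not a definitive invocation.
+
+```python
+import time
+
+# Assumed module name: the shared library is symlinked into src/monitoring_sys
+# by the libmsys_pymod target; the actual name is whatever MSYS_MODNAME was.
+from monitoring_sys import libmsys as msys
+
+assert msys.initialize("/tmp/msys_logs")  # log dir must exist and be writable
+
+# 500 ms default sample period for meters added without an explicit period
+sid = msys.getMonitoringSystem("/tmp/msys_out", 500)
+msys.addCPUMeterToSystem(sid)  # inherits the system default sample period
+
+if msys.testRun(sid):  # probe every meter once; buffers are reset afterwards
+    msys.startRecording(sid)
+    time.sleep(5)  # stand-in for the workload under measurement
+    msys.stopRecording(sid)
+msys.reportStatus(sid, True)  # verbose=True: print to stdout
+```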
diff --git a/monitoring_sys/include/cpu_meter.hh b/monitoring_sys/include/cpu_meter.hh
new file mode 100644
index 0000000..f52e6b4
--- /dev/null
+++ b/monitoring_sys/include/cpu_meter.hh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "include/logger.hh"
+#include "include/meter.hh"
+#include "include/utils.hh"
+
+#include "generated/proto/cpu_metrics.pb.h"
+
+namespace MSys {
+
+class CPUMeter final : public Meter {
+  public:
+    CPUMeter(cr::milliseconds tick_period);
+
+    bool update(bool testrun) override final;
+    std::string getDetailedReport() const override final;
+
+  private:
+    const unsigned ncores;
+};
+
+} // namespace MSys
diff --git a/monitoring_sys/include/disk_meter.hh b/monitoring_sys/include/disk_meter.hh
new file mode 100644
index 0000000..f107218
--- /dev/null
+++ b/monitoring_sys/include/disk_meter.hh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "include/logger.hh"
+#include "include/meter.hh"
+#include "include/utils.hh"
+
+#include "generated/proto/disk_metrics.pb.h"
+
+namespace MSys {
+
+class DiskMeter final : public Meter {
+  public:
+    DiskMeter(cr::milliseconds tick_period, const std::vector<std::string> &devices);
+
+    bool update(bool testrun) override final;
+    std::string getDetailedReport() const override final;
+
+  private:
+    const std::unordered_set<std::string> devices;
+};
+
+} // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/include/gpu_meter.hh b/monitoring_sys/include/gpu_meter.hh
new file mode 100644
index 0000000..c5e009f
--- /dev/null
+++ b/monitoring_sys/include/gpu_meter.hh
@@ -0,0 +1,50 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include "include/logger.hh"
+#include "include/meter.hh"
+#include "include/utils.hh"
+
+#include "generated/proto/gpu_metrics.pb.h"
+
+namespace MSys {
+
+struct NVMLProperties {
+    bool gpm_supported;
+};
+
+class GPUMeter final : public Meter {
+  public:
+    GPUMeter(
+        cr::milliseconds tick_period, const std::vector<unsigned> &gpu_ids,
+        const std::vector<unsigned> &nvml_metrics, const std::vector<unsigned> &gpm_metrics);
+    ~GPUMeter();
+
+    bool update(bool testrun) override final;
+    std::string getDetailedReport() const override final;
+
+  private:
+    const std::vector<unsigned> gpu_ids;
+    const std::vector<unsigned> nvml_metrics;
+    const std::vector<unsigned> gpm_metrics;
+
+  private:
+    /**
+     * Records if the meter has started to record data. This is used to
+     * determine if the first sample should be retrieved without getting the
+     * metrics.
+     */
+    bool started = false;
+    /**
+     * Format for the NVML GPM metrics get, used to retrieve GPM metrics. This
+     * variable should !!!NOT!!! be modified after initialization
+     */
+    nvmlGpmMetricsGet_t gpm_mg_format;
+    FixedSizeVector<std::pair<nvmlDevice_t, NVMLProperties>> nvml_devs;
+    FixedSizeVector<std::pair<nvmlGpmSample_t, nvmlGpmSample_t>> gpm_samples;
+};
+
+} // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/include/logger.hh b/monitoring_sys/include/logger.hh
new file mode 100644
index 0000000..d43cecc
--- /dev/null
+++ b/monitoring_sys/include/logger.hh
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "include/utils.hh"
+
+namespace MSys {
+
+bool loggerInitialize(const std::string &log_dir);
+const fs::path &getLoggerFolder();
+const fs::path &getLoggerFile();
+void loggerDeinitialize();
+
+} // namespace MSys
diff --git a/monitoring_sys/include/mem_meter.hh b/monitoring_sys/include/mem_meter.hh
new file mode 100644
index 0000000..88f8993
--- /dev/null
+++ b/monitoring_sys/include/mem_meter.hh
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "include/logger.hh"
+#include "include/meter.hh"
+#include "include/utils.hh"
+
+#include "generated/proto/mem_metrics.pb.h"
+
+namespace MSys {
+class MemMeter final : public Meter {
+  public:
+    MemMeter(
+        cr::milliseconds tick_period,
+        const std::vector<MemMetadata::Probe> &probes = {MemMetadata::MEM_BASIC});
+
+    bool update(bool testrun) override final;
+    std::string getDetailedReport() const override final;
+
+  private:
+    const std::vector<MemMetadata::Probe> probes;
+    std::unique_ptr<KVRepr> mem_info_repr;
+
+    std::unordered_set<std::string> mem_info_fields;
+};
+
+} // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/include/meter.hh b/monitoring_sys/include/meter.hh
new file mode 100644
index 0000000..798244e
--- /dev/null
+++ b/monitoring_sys/include/meter.hh
@@ -0,0 +1,122 @@
+#pragma once
+
+#include
+#include
+
+#include "include/utils.hh"
+
+namespace MSys {
+
+constexpr cr::milliseconds period_step{100};
+
+class Meter {
+  public:
+    Meter(
+        const std::string &name, cr::milliseconds tick_period,
+        std::function<proto::Message *()> stat_tser_factory,
+        const std::string &file_suffix = std::string(file_default_suffix));
+
+    /* Disable copy constructor */
+    Meter(const Meter &) = delete;
+
+    virtual ~Meter();
+
+    /**
+     * Probe once for the statistics specified in the probe
+     * @return true if the probe was successful, false otherwise
+     */
+    virtual bool update(bool testrun = false) = 0;
+
+    virtual void resetBuffer() noexcept final;
+
+  public:
+    const std::string_view getName() const;
+    const cr::milliseconds getTickPeriod() const;
+
+  public:
+    /**
+     * Get the estimated memory consumption of the current stat protobuf; calls
+     * the internal .SpaceUsedLong()
+     *
+     * @return approximate memory consumption for the message in bytes
+     */
+    virtual size_t getCurrentMessageMemorySize() final;
+
+    /**
+     * Get the exact binary wire format size of the current stat protobuf; calls
+     * the internal .ByteSizeLong()
+     *
+     * @return exact binary wire format size for the message in bytes
+     */
+    virtual size_t getCurrentMessageSerializedSize() final;
+
+    virtual std::string getDetailedReport() const;
+
+  public:
+    static constexpr std::string_view file_default_suffix = ".pb.bin";
+
+  protected:
+    /** Name of the meter, used in human-readable reports */
+    const std::string name;
+    /** Suffix of the file, used in file output */
+    const std::string file_suffix;
+    /** Meter record interval, in milliseconds */
+    const cr::milliseconds tick_period;
+
+  protected:
+    template <typename T>
+    T *getCurrentBuffer() const;
+
+  private:
+    /** Result protobuf time series */
+    proto::Message *stat_tser;
+    std::atomic stat_tser_dbuffer;
+
+  public:
+    // === File I/O ===
+    /**
+     * Write the current stat_tser to the file descriptor asynchronously;
+     * this function will return immediately and the actual writing is done
+     * in a separate thread using std::async.
+     *
+     * @note the sync option does NOT mean the file content will be synced
+     * @param sync if true, wait for the write to finish before returning
+     * @return the size of the expected written data in bytes, or -1 on error
+     */
+    virtual ssize_t writeDataToFile(bool sync = false) noexcept final;
+
+    /**
+     * Force the file descriptor to sync to disk; this will ensure that all
+     * the data written to the file descriptor is flushed to disk.
+     *
+     * @note this function does NOT write any data to the file descriptor,
+     * it only flushes the data already written.
+     */
+    virtual void fsyncDataToFile() noexcept final;
+
+  public:
+    virtual void assignOutputDir(const fs::path &output_dir) final;
+    virtual const fs::path &getOutputPath() const final;
+    virtual size_t getWrittenTimes() const final;
+    virtual size_t getWrittenSize() const final;
+
+  private:
+    fs::path file_path;
+    int fd = -1;
+    std::unique_ptr<std::future<ssize_t>> async_write_ret;
+    std::atomic<size_t> written_times = 0;
+    std::atomic<size_t> written_size = 0;
+
+  public:
+    bool isValid() const;
+
+  protected:
+    void markValid();
+
+  private:
+    bool is_valid = false;
+};
+
+} // namespace MSys
+
+#include "include/meter.ipp"
diff --git a/monitoring_sys/include/meter.ipp b/monitoring_sys/include/meter.ipp
new file mode 100644
index 0000000..d4378f1
--- /dev/null
+++ b/monitoring_sys/include/meter.ipp
@@ -0,0 +1,10 @@
+#include "include/utils.hh"
+
+namespace MSys {
+
+template <typename T>
+T *Meter::getCurrentBuffer() const {
+    return dynamic_cast<T *>(stat_tser);
+}
+
+} // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/include/msys.hh b/monitoring_sys/include/msys.hh
new file mode 100644
index 0000000..2bc2e9a
--- /dev/null
+++ b/monitoring_sys/include/msys.hh
@@ -0,0 +1,228 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include "include/utils.hh"
+
+#include "include/cpu_meter.hh"
+#include "include/disk_meter.hh"
+#include "include/gpu_meter.hh"
+#include "include/mem_meter.hh"
+#include "include/proc_meter.hh"
+
+namespace MSys {
+
+namespace Detail {
+
+void processTerminationHandler(bool);
+
+} // namespace Detail
+
+typedef int SystemID;
+constexpr SystemID invalidSystemID = static_cast<SystemID>(-1);
+
+// Forward declaration
+class WorkerInfo;
+
+static constexpr size_t default_msg_write_size_threshold = 1 * 1024 * 1024; // 1 MiB
+
+class System final {
+  public:
+    System(
+        SystemID id, const std::string &system_name, const fs::path &output_dir,
+        cr::milliseconds default_sample_period, const size_t msg_write_size_threshold);
+
+    // disable copy constructors
+    System(const System &) = delete;
+    System &operator=(const System &) = delete;
+
+    // disable move constructors
+    System(System &&) = delete;
+    System &operator=(System &&) = delete;
+
+    ~System();
+
+  public:
+    /* NOTE: The following functions are used to manage the operation status of the system when
+     * the system is not running. The functions that are not explicitly marked as noexcept may
+     * throw exceptions if some conditions are not met. Refer to the documentation of each
+     * function for more details.
+     */
+
+    /**
+     * Add a meter to the system.
+     *
+     * @param m Meter to be added
+     * @return Whether the meter is added successfully
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     */
+    bool addMeter(std::unique_ptr<Meter> &&m) noexcept;
+
+    /**
+     * Start the system recording: start all the meter threads and begin to record data.
+     *
+     * @exception assertion_error thrown if the system is not correctly initialized.
+     * @return Whether the system is started successfully.
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     * @note An exception will be thrown if the system is not correctly initialized to avoid
+     * silent failures, so false is only returned when the system is already in operation.
+     */
+    bool startRecording();
+
+    /**
+     * Stop the system recording: stop all the meter threads and persist all the data to disk.
+     *
+     * @return Whether the system is stopped successfully
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     */
+    bool stopRecording() noexcept;
+
+    /**
+     * Report the current status of the system in human-readable format
+     *
+     * @param verbose If true, print to stdout; otherwise print using the logger
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     */
+    void reportStatus(bool verbose = false, bool detail = false) noexcept;
+
+    /**
+     * Perform a test run of the system; this will update all the meters in the system and
+     * return true if all the meters are updated successfully, false otherwise.
+     *
+     * @return True if all meters are tested and updated successfully without any errors,
+     * false otherwise
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     * @note This function will reset all the buffers of the meters before and after the test
+     * run. If the system is already in operation, the function will bail and return false.
+     */
+    bool testRun();
+
+    /**
+     * Reset all the buffers of the meters in the system
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+     * @note This function will bail if the system is in operation, and will not reset the
+     * buffers
+     */
+    void resetAllBuffers() noexcept;
+
+  private:
+    /**
+     * Reset all the buffers of the meters in the system without checking if the system is in
+     * operation
+     */
+    void resetAllBuffersInternal() noexcept;
+
+  public:
+    /* NOTE: The following functions are used to update the system while the
+     * system is running. The functions that are not explicitly marked as
+     * noexcept may throw exceptions if some conditions are not met. Refer
+     * to the documentation of each function for more details. */
+
+    /**
+     * Update the system; this will call the update function of all the meters in the system,
+     * and return true if all the meters are updated successfully, false otherwise.
+     *
+     * @return True if all meters are updated successfully, false otherwise
+     */
+    bool update() noexcept;
+
+  private:
+    /**
+     * Check if the system is currently recording
+     *
+     * @return True if the system is recording, false otherwise
+     *
+     * @note Will attempt to grab the operation status mutex and check for system idleness.
+ */ + bool isRecording(); + + public: + static constexpr std::string_view system_default_name = ""; + SystemID getSystemID() const; + const std::string_view getSystemName() const; + const fs::path &getOutputDir() const; + size_t getMsgWriteSizeThreshold() const; + const cr::milliseconds &getDefaultSamplePeriod() const; + /** + * Check if all the meters in the system is valid + * + * @return True if the all meters are valid, false otherwise + * + * @note This function assumes the operation status mutex is locked + */ + bool isValid() const; + + private: + /** + * Halt the system and all worker threads, this will stop all the recording and persist all the + * data to disk. + * + * @note This function assumes the operation status mutex is locked + */ + void halt() noexcept; + + private: + const SystemID system_id; + const std::string system_name; + + const fs::path output_dir; + const size_t msg_write_size_threshold; + const cr::milliseconds default_sample_period; + + // state of the system running info + mutable std::mutex operation_status_mutex; + bool in_operation = false; + + // affiliated threads info + std::unique_ptr worker_info = nullptr; + + // meters + std::vector> meter_list; + + public: + friend class WorkerInfo; + // termination handler needs monitoring system internal states + friend void Detail::processTerminationHandler(bool); +}; + +class WorkerInfo { + public: + WorkerInfo() = delete; + WorkerInfo(System *, unsigned nmeters); + ~WorkerInfo(); + + private: + void coordinator_thread_func(); + void worker_thread_func(const unsigned thread_idx); + + System *system; + std::barrier<> worker_sync_point; + std::atomic worker_stop; + + FixedSizeVector> meter_update_durations; + FixedSizeVector meter_thread_finish_times; + + const cr::time_point system_creation_time; + + std::thread coordinator_thread; + std::vector worker_threads; +}; + +bool msysInitialize(const std::string &log_dir); + +// template +// SystemID constructNewSystem(Args &&...args); +SystemID constructNewSystem( + const std::string &output_dir, unsigned default_sample_period_ms, + const std::string &system_name = "", + const size_t msg_write_size_threshold = default_msg_write_size_threshold); +std::shared_ptr retrieveSystemUsingIndex(SystemID id); +bool msysTestRun(); + +} // namespace MSys diff --git a/monitoring_sys/include/proc_meter.hh b/monitoring_sys/include/proc_meter.hh new file mode 100644 index 0000000..cff1d45 --- /dev/null +++ b/monitoring_sys/include/proc_meter.hh @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +#include "include/logger.hh" +#include "include/meter.hh" +#include "include/utils.hh" + +#include "generated/proto/proc_metrics.pb.h" + +namespace MSys { + +class ProcMeter final : public Meter { + public: + ProcMeter( + cr::milliseconds tick_period, const std::vector &pids, + const std::vector &probes); + + bool update(bool testrun) override final; + std::string getDetailedReport() const override final; + + private: + const std::vector pids; + const std::unordered_set probes; +}; + +} // namespace MSys \ No newline at end of file diff --git a/monitoring_sys/include/utils.hh b/monitoring_sys/include/utils.hh new file mode 100644 index 0000000..5e05a23 --- /dev/null +++ b/monitoring_sys/include/utils.hh @@ -0,0 +1,259 @@ +#pragma once + +#include +#include +#include +#include +#include + +// cuda monitoring +#include +#include + +// absl logging +#include +#include +#include +#include +#include +#include +// other absl +#include + +// protobuf +#include + +// export functions +#define 
MSYS_EXPORT __attribute__((__visibility__("default"))) +#define MSYS_HIDDEN __attribute__((__visibility__("hidden"))) + +// global namespace alias +namespace cr = std::chrono; +namespace fs = std::filesystem; +namespace proto = google::protobuf; + +// copied from LinuxMachine.h from htop (https://github.com/htop-dev/htop) +#ifndef PROCDIR +#define PROCDIR "/proc" +#endif + +#ifndef PROCCPUINFOFILE +#define PROCCPUINFOFILE PROCDIR "/cpuinfo" +#endif + +// used in /proc/stat and /proc//stat +#ifndef STATFILE +#define STATFILE "/stat" +#endif + +#ifndef STATMFILE +#define STATMFILE "/statm" +#endif + +#ifndef IOFILE +#define IOFILE "/io" +#endif + +#ifndef PROCSTATFILE +#define PROCSTATFILE PROCDIR STATFILE +#endif + +#ifndef PROCMEMINFOFILE +#define PROCMEMINFOFILE PROCDIR "/meminfo" +#endif + +#ifndef PROCDISKSTATSFILE +#define PROCDISKSTATSFILE PROCDIR "/diskstats" +#endif + +// global helpper macro +#define printerr(...) fprintf(stderr, ##__VA_ARGS__) + +#define MSYS_EXPAND(x) x +#define MSYS_STRINGIFY(x) #x +#define MSYS_TOSTRING(x) MSYS_STRINGIFY(x) + +#define UNUSED(x) (void)(x) + +#if defined(__GNUC__) || defined(__clang__) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define unlikely(x) (x) +#define likely(x) (x) +#endif + +#define nvmlCall(ret) \ + do { \ + if (ret != NVML_SUCCESS) { \ + LOG(ERROR) << absl::StrFormat( \ + "NVML call failed with return value %d (%s)", (int)ret, nvmlErrorString(ret)); \ + } \ + } while (false) + +// global helpper constexpr +template +constexpr unsigned log2Floor(T x); + +template +constexpr unsigned log10Floor(T x); + +template +constexpr unsigned log2Ceil(T x); + +template +constexpr unsigned log10Ceil(T x); + +// concepts +template +concept IsProtoMessage = std::is_base_of::value; + +// global helpper function +unsigned getSystemNProc(); +unsigned getSystemHz(); +cr::nanoseconds nsSinceEpoch(); + +/** + * @brief Indent each line of a string with a given prefix. + * @note This function is expensive because it format strings in a pretty way. + * @param input The input string to indent. + * @param prefix The prefix to add to each line. + * @return A new string with each line indented by the prefix. + */ +std::string indent(const std::string &input, const std::string &prefix); + +/** + * @brief Pad a value with designated character to a specified width. + * @param value The value to printed and padded. + * @param width The desired width. + * @param fill The character to use for padding. + * @return A string representation of the padded value. + */ +template +std::string strPad(T value, unsigned width, char fill = ' '); + +/** + * @brief Join a range of strings with a separator. + * @note This function is expensive because it format strings in a pretty way. + * @param begin The beginning of the range. + * @param end The end of the range. + * @param sep The separator to use between elements. + * @return A single string with all elements joined by the separator. + */ +template +std::string strJoin(const Iterator &begin, const Iterator &end, const std::string &sep); + +/** + * Validate whether a given path exists in current filesystem and return a + * fs::path object corresponding to it. + * + * @param dir target directory to be examined + * @return realpath of dir if the directory exists, and empty path if not + */ +fs::path validateDir(const std::string &dir); + +template +void verbosePrint(bool verbose, const char *format, Args... 
args); + +/* Fixed size vector that size can be determined dynamically at runtime by + marking every size-changing function as private */ +template > +class FixedSizeVector : private std::vector { + public: + using std::vector::vector; + using std::vector::size; + using std::vector::operator[]; + using std::vector::begin; + using std::vector::end; +}; + +/* Fixed size unordered map that size can be determined dynamically at runtime + by marking every size-changing function as private */ +template < + class Key, class T, class Hash = std::hash, class KeyEqual = std::equal_to, + class Allocator = std::allocator>> +class FixedSizeUnorderedMap : private std::unordered_map { + public: + using std::unordered_map::unordered_map; + using std::unordered_map::size; + using std::unordered_map::operator[]; + using std::unordered_map::find; + using std::unordered_map::begin; + using std::unordered_map::end; +}; + +/* Fixed size unordered set that size can be determined dynamically at runtime + by marking every size-changing function as private */ +template < + class Key, class Hash = std::hash, class KeyEqual = std::equal_to, + class Allocator = std::allocator> +class FixedSizeUnorderedSet : private std::unordered_set { + public: + using std::unordered_set::unordered_set; + using std::unordered_set::size; + using std::unordered_set::find; + using std::unordered_set::begin; + using std::unordered_set::end; +}; + +/** + * @brief Class to parse key-value representation from a file and give results in the form of + * protobuf messages. + * @note This function assumes the format of the stat file is not changed during the subsequent + * reads. + * @warning This function only supports the case where the key field take precedence over the value + * field, i.e., the first field is the key and the second field is the value + * This class reads a file containing key-value pairs and parses them into protobuf messages + * based on the provided descriptors and key lists. + */ +class KVRepr { + public: + /** + * @brief Constructor for KVRepr for a file. + * @param stat_file_path Path to the file containing key-value pairs. + * @param message_descs Vector of protobuf message descriptors. + * @param key_lists Vector of key lists corresponding to each message descriptor. + * @param field_scanf_format Format string for scanf to parse each line in the file. + * @param key_field_max_length Maximum length of the key field. + * @note Rules for scanf format: + * 1) Newline characters are not allowed in the format because line counting will be used + * to determine the position of each key-value pair in the file. + * 2) The format must contain exactly two fields, one for the key and one for the value. + * 3) The key and value should be strings specified using %s, scanset, or negated scanset. + * 4) The key field must take precedence over the value field. 
+ */ + KVRepr( + const fs::path &stat_file_path, const std::vector &message_descs, + const std::vector> &key_lists, + const std::string &field_scanf_format = "%64s %32s ", + const unsigned key_field_max_length = 64, const unsigned val_field_max_length = 32); + bool parseOnce(std::vector &messages) const; + bool isValid() const; + + const fs::path &getStatFilePath() const; + std::string generateStatusReport() const; + + private: + const fs::path stat_file_path; + const std::vector message_descs; + const std::vector> key_lists; + const std::string field_scanf_format; + std::string field_fast_scanf_format; + const unsigned key_field_max_length; + const unsigned val_field_max_length; + + /** + * @brief (line_number) -> + * @note The container is ordered by line number to allow ordered traversal when scanning the + * file. + */ + std::map> kv_map; + /** + * @brief [] + * @note Only used on generating status report. + */ + std::vector> missing_fields; + bool valid = false; +}; + +#include "include/utils.ipp" diff --git a/monitoring_sys/include/utils.ipp b/monitoring_sys/include/utils.ipp new file mode 100644 index 0000000..a029364 --- /dev/null +++ b/monitoring_sys/include/utils.ipp @@ -0,0 +1,54 @@ +#include + +#include + +// global helpper constexpr +template +constexpr unsigned log2Floor(T x) { + return (x < 2) ? 0 : 1 + log2Floor(x >> 1); +} + +template +constexpr unsigned log10Floor(T x) { + return (x < 10) ? 0 : 1 + log10Floor(x / 10); +} + +template +constexpr unsigned log2Ceil(T x) { + return (x < 2) ? 0 : log2Floor(x - 1) + 1; +} + +template +constexpr unsigned log10Ceil(T x) { + return (x < 10) ? 0 : log10Floor(x - 1) + 1; +} + +template +std::string strPad(T value, unsigned width, char fill) { + std::string str = std::to_string(value); + std::ostringstream oss; + oss << std::setw(width) << std::setfill(fill) << str; + return oss.str(); +} + +template +std::string strJoin(const Iterator &begin, const Iterator &end, const std::string &sep) { + std::string result; + for (auto it = begin; it != end; ++it) { + if (!result.empty()) { + result += sep; + } + result += *it; + } + return result; +} + +template +void verbosePrint(bool verbose, const char *format, Args... 
args) { + if (verbose) { + fprintf(stderr, format, args...); + fputc('\n', stderr); + } else { + LOG(LEVEL(severity)) << absl::StrFormat(format, args...); + } +} diff --git a/monitoring_sys/msys_defs.cmake b/monitoring_sys/msys_defs.cmake new file mode 100644 index 0000000..a9827e3 --- /dev/null +++ b/monitoring_sys/msys_defs.cmake @@ -0,0 +1,60 @@ +# common monitoring_sys variables +# it sets the following variables for connivent target generation +# [MSYS_SOURCES]: all source files of the monitoring_sys +# [MSYS_LIBRARIES]: all libraries needed by the monitoring_sys +# [MSYS_INCLUDES]: all required include directories for the monitoring_sys +# [PROTO_PY_SOURCES]: generated protobuf python interfaces + +find_package(CUDAToolkit REQUIRED 12.4) +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) + +# protobuf compilation +file(GLOB PROTO_SOURCES ${RESOURCE_DIR}/proto/*.proto) +proto_compile(MSYS_PROTO_DEP + SOURCE_DIR ${RESOURCE_DIR}/proto + CXX_DEST_DIR ${CMAKE_CURRENT_LIST_DIR}/generated/proto + PY_DEST_DIR ${PYTHON_SRC_DIR}/proto + GEN_SOURCES PROTO_SOURCES + SOURCES ${PROTO_SOURCES} +) + +# get generated cxx source files +set(PROTO_CC_SOURCES ${PROTO_SOURCES}) +list(FILTER PROTO_CC_SOURCES INCLUDE REGEX "\\.cc$") +# get generated python source files +set(PROTO_PY_SOURCES ${PROTO_SOURCES}) +list(FILTER PROTO_PY_SOURCES INCLUDE REGEX "\\.py$") + +# === determine [MSYS_SOURCES] === +set(MSYS_SOURCES "") +# find all build sources +file(GLOB MSYS_CC_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/*.cc) +# aggregate them +list(APPEND MSYS_SOURCES ${PROTO_CC_SOURCES}) +list(APPEND MSYS_SOURCES ${DATELIB_SOURCES}) +list(APPEND MSYS_SOURCES ${MSYS_CC_SOURCES}) + +# === determine [MSYS_DEPENDS] === +# find all dependencies +set(MSYS_DEPENDS + # CUDA libraries + CUDA::cupti + CUDA::nvml + # protobuf and absl libraries + protobuf::libprotobuf + absl::log + # for zoned date support + date::date-tz) + +# === MSYS_INCLUDES [MSYS_INCLUDES] === +# find all includes +set(MSYS_INCLUDES + # project + ${CMAKE_CURRENT_LIST_DIR} + # third party + ${PYBIND11_INCLUDES} + ${DATELIB_INCLUDES} + # external libraries + ${CUDAToolkit_INCLUDE_DIRS} + ${Python3_INCLUDE_DIRS} + ${Protobuf_INCLUDE_DIRS}) diff --git a/monitoring_sys/src/cpu_meter.cc b/monitoring_sys/src/cpu_meter.cc new file mode 100644 index 0000000..235c7e7 --- /dev/null +++ b/monitoring_sys/src/cpu_meter.cc @@ -0,0 +1,168 @@ +#include "include/cpu_meter.hh" + +namespace MSys { + +namespace Detail { + +static const char *core_stat_format = + "%*s " // (1) [NT] cpu %s (in format of "cpu\d*") + "%lu " // (2) [1] user %llu + "%lu " // (3) [2] nice %llu + "%lu " // (4) [3] system %llu + "%lu " // (5) [4] idle %llu + "%lu " // (6) [5] iowait %llu + "%lu " // (7) [6] irq %llu + "%lu " // (8) [7] softirq %llu + "%lu " // (9) [8] steal %llu + "%lu " // (10) [9] guest %llu + "%lu " // (11) [10] guest_nice %llu + ; + +static const char *kernel_misc_stat_format = + "intr %lu %*[^\n] " // (1) [1, NT] intr %lu + "ctxt %lu " // (2) [2] ctxt %lu + "btime %*lu " // (3) [NT] btime %lu + "processes %lu " // (4) [3] processes %lu + "procs_running %u " // (5) [4] procs_running %u + "procs_blocked %u " // (6) [5] procs_blocked %u + ; + +static const char *softirq_stat_format = + "softirq " // (1) [NT] softirq %s + "%lu " // (2) [1] total %lu + "%lu " // (3) [2] hi %lu + "%lu " // (4) [3] timer %lu + "%lu " // (5) [4] net_tx %lu + "%lu " // (6) [5] net_rx %lu + "%lu " // (7) [6] block %lu + "%lu " // (8) [7] irq_poll %lu + "%lu " // (9) [8] tasklet %lu + "%lu " // (10) 
[9] sched %lu + "%lu " // (11) [10] hrtimer %lu + "%lu " // (12) [11] rcu %lu + ; + +static inline bool parseProcStat(unsigned ncores, CPUMetrics *metrics) { + FILE *fp = fopen(PROCSTATFILE, "r"); + + if (unlikely(!fp)) { + LOG(ERROR) << absl::StrFormat( + "[CPUMeter] Failed to open %s: %s", PROCSTATFILE, strerror(errno)); + return false; + } + + // time stamp + metrics->set_timestamp(cr::steady_clock::now().time_since_epoch().count()); + + bool ret = true; + // Core stats + { + unsigned long user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice; + for (unsigned core_stat_idx = 0; core_stat_idx < ncores + 1; core_stat_idx++) { + CoreStat *core_stat = metrics->add_core_stats(); + int nfields = fscanf( + fp, core_stat_format, &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, + &guest, &guest_nice); + if (unlikely(nfields < 10)) { + // If we don't have all fields, we can still proceed with the available + // ones. + LOG(WARNING) << absl::StrFormat( + "[CPUMeter] Expected 10 fields in /proc/stat for core %u, got %d. " + "Some metrics may be missing.", + core_stat_idx - 1, nfields); + ret = false; + } + core_stat->set_user(user); + core_stat->set_nice(nice); + core_stat->set_system(system); + core_stat->set_idle(idle); + core_stat->set_iowait(iowait); + core_stat->set_irq(irq); + core_stat->set_softirq(softirq); + core_stat->set_steal(steal); + core_stat->set_guest(guest); + core_stat->set_guest_nice(guest_nice); + } + } + + // Kernel misc stats + { + KernelMiscStat *misc_stat = metrics->mutable_kernel_misc_stat(); + unsigned long intr, ctxt, processes; + unsigned procs_running, procs_blocked; + + int nfields = fscanf( + fp, kernel_misc_stat_format, &intr, &ctxt, &processes, &procs_running, &procs_blocked); + if (unlikely(nfields < 5)) { + // If we don't have all fields, we can still proceed with the available + // ones. + LOG(WARNING) << absl::StrFormat( + "[CPUMeter] Expected 5 fields in /proc/stat, got %d. " + "Some metrics may be missing.", + nfields); + ret = false; + } + misc_stat->set_intr(intr); + misc_stat->set_ctxt(ctxt); + misc_stat->set_processes(processes); + misc_stat->set_procs_running(procs_running); + misc_stat->set_procs_blocked(procs_blocked); + } + + // SoftIRQ stats + { + SoftIRQStat *softirq_stat = metrics->mutable_soft_irq_stat(); + unsigned long total, hi, timer, net_tx, net_rx, block, irq_poll, tasklet, sched, hrtimer, + rcu; + + int nfields = fscanf( + fp, softirq_stat_format, &total, &hi, &timer, &net_tx, &net_rx, &block, &irq_poll, + &tasklet, &sched, &hrtimer, &rcu); + if (unlikely(nfields < 11)) { + // If we don't have all fields, we can still proceed with the available + // ones. + LOG(WARNING) << absl::StrFormat( + "[CPUMeter] Expected 11 fields in /proc/softirqs, got %d. 
" + "Some metrics may be missing.", + nfields); + ret = false; + } + softirq_stat->set_total(total); + softirq_stat->set_hi(hi); + softirq_stat->set_timer(timer); + softirq_stat->set_net_tx(net_tx); + softirq_stat->set_net_rx(net_rx); + softirq_stat->set_block(block); + softirq_stat->set_irq_poll(irq_poll); + softirq_stat->set_tasklet(tasklet); + softirq_stat->set_sched(sched); + softirq_stat->set_hrtimer(hrtimer); + softirq_stat->set_rcu(rcu); + } + + fclose(fp); + return ret; +} + +} // namespace Detail + +CPUMeter::CPUMeter(cr::milliseconds tick_period) + : Meter("CPUMeter", tick_period, [] { return new CPUMetricsTimeSeries(); }), + ncores(getSystemNProc()) { + markValid(); +} + +bool CPUMeter::update(bool testrun) { + UNUSED(testrun); + + CPUMetrics *cpu_metrics = getCurrentBuffer()->add_metrics(); + return Detail::parseProcStat(ncores, cpu_metrics); +} + +std::string CPUMeter::getDetailedReport() const { + std::string report; + report += absl::StrFormat("Number of CPU cores: %u\n", ncores); + return report; +} + +} // namespace MSys diff --git a/monitoring_sys/src/disk_meter.cc b/monitoring_sys/src/disk_meter.cc new file mode 100644 index 0000000..0602a61 --- /dev/null +++ b/monitoring_sys/src/disk_meter.cc @@ -0,0 +1,182 @@ +#include "include/disk_meter.hh" + +namespace MSys { + +namespace Detail { + +// FIXME: the number of characters in device_cstr is limited to 64 +static constexpr unsigned device_cstr_size = 64; + +static const char *proc_diskstats_header_format = + "%*d " // (1) [NT] major %d + "%*d " // (2) [NT] minor %d + "%s " // (3) [1] device %s + ; + +static const char *proc_diskstats_format = + "%lu " // (4) [1] reads_completed %lu + "%lu " // (5) [2] reads_merged %lu + "%lu " // (6) [3] sectors_read %lu + "%lu " // (7) [4] time_spent_reading_ms %lu + "%lu " // (8) [5] writes_completed %lu + "%lu " // (9) [6] writes_merged %lu + "%lu " // (10) [7] sectors_written %lu + "%lu " // (11) [8] time_spent_writing_ms %lu + "%lu " // (12) [9] io_in_progress %lu + "%lu " // (13) [10] time_spent_io_ms %lu + "%lu " // (14) [11] weighted_time_spent_io_ms %lu + "%lu " // (15) [12] discard_completed %lu + "%lu " // (16) [13] discard_merged %lu + "%lu " // (17) [14] discard_sectors %lu + "%lu " // (18) [15] time_spent_discarding_ms %lu + "%lu " // (19) [16] flush_completed %lu + "%lu " // (20) [17] time_spent_flushing_ms %lu + ; + +static inline bool parseProcDiskstats( + const std::unordered_set &devices, DiskMetrics *metrics) { + FILE *fp = fopen(PROCDISKSTATSFILE, "r"); + if (unlikely(!fp)) { + LOG(ERROR) << absl::StrFormat( + "[DiskMeter] Failed to open %s: %s", PROCDISKSTATSFILE, strerror(errno)); + fclose(fp); + return false; + } + + std::unordered_set remaining_devices(devices); + + // time stamp + metrics->set_timestamp(cr::steady_clock::now().time_since_epoch().count()); + + while (remaining_devices.size() > 0) { + char device_cstr[device_cstr_size]; + int nfields = fscanf(fp, proc_diskstats_header_format, &device_cstr); + if (nfields == EOF) { + // EOF reached before the stat for all the devices are read + LOG(WARNING) << absl::StrFormat( + "[DiskMeter] EOF reached while reading %s, remaining devices: %zu", + PROCDISKSTATSFILE, remaining_devices.size()); + fclose(fp); + return false; + } + std::string device(device_cstr); + if (remaining_devices.find(device) == remaining_devices.end()) { + // Skip the rest of the line + // FIXME: discarding return result for a function that is marked as + // [[nodiscard]] + int discard = fscanf(fp, "%*[^\n] "); + (void)discard; + continue; + } 
+ + unsigned long reads_completed, reads_merged, sectors_read, time_spent_reading; + unsigned long writes_completed, writes_merged, sectors_written, time_spent_writing; + unsigned long io_in_progress, time_spent_io, weighted_time_spent_io; + unsigned long discard_completed, discard_merged, discard_sectors, time_spent_discarding; + unsigned long flush_completed, time_spent_flushing; + + PerDiskMetrics *disk_stat = metrics->add_disk_metrics(); + nfields = fscanf( + fp, proc_diskstats_format, &reads_completed, &reads_merged, §ors_read, + &time_spent_reading, &writes_completed, &writes_merged, §ors_written, + &time_spent_writing, &io_in_progress, &time_spent_io, &weighted_time_spent_io, + &discard_completed, &discard_merged, &discard_sectors, &time_spent_discarding, + &flush_completed, &time_spent_flushing); + if (unlikely(nfields < 17)) { + LOG(WARNING) << absl::StrFormat( + "[DiskMeter] Expected 18 fields in %s, got %d. Some metrics may be missing.", + PROCDISKSTATSFILE, nfields); + } + + disk_stat->set_reads_completed(reads_completed); + disk_stat->set_reads_merged(reads_merged); + disk_stat->set_sectors_read(sectors_read); + disk_stat->set_time_spent_reading(time_spent_reading); + disk_stat->set_writes_completed(writes_completed); + disk_stat->set_writes_merged(writes_merged); + disk_stat->set_sectors_written(sectors_written); + disk_stat->set_time_spent_writing(time_spent_writing); + disk_stat->set_io_in_progress(io_in_progress); + disk_stat->set_time_spent_io(time_spent_io); + disk_stat->set_weighted_time_spent_io(weighted_time_spent_io); + disk_stat->set_discard_completed(discard_completed); + disk_stat->set_discard_merged(discard_merged); + disk_stat->set_discard_sectors(discard_sectors); + disk_stat->set_time_spent_discarding(time_spent_discarding); + disk_stat->set_flush_completed(flush_completed); + disk_stat->set_time_spent_flushing(time_spent_flushing); + + remaining_devices.erase(device); + } + + fclose(fp); + return true; +} + +bool checkDiskExistence(const std::unordered_set &devices) { + std::unordered_multiset disks; + + FILE *fp = fopen(PROCDISKSTATSFILE, "r"); + if (!fp) { + LOG(WARNING) << absl::StrFormat("[DiskMeter] Failed to open %s", PROCDISKSTATSFILE); + return false; + } + + std::unordered_set remaining_devices = devices; + while (remaining_devices.size() > 0) { + char device_cstr[device_cstr_size]; + int nfields = fscanf(fp, proc_diskstats_header_format, &device_cstr); + if (nfields == EOF) { + std::string warning_msg = + "[DiskMeter] Not all devices required exist, list of nonexistent devices:"; + for (const std::string &device : remaining_devices) { + warning_msg += absl::StrFormat(" %s", device); + } + LOG(WARNING) << warning_msg; + fclose(fp); + return false; + } + std::string device(device_cstr); + auto it = remaining_devices.find(device); + if (it != remaining_devices.end()) remaining_devices.erase(it); + + // discard the rest of the line + int discard = fscanf(fp, "%*[^\n] "); + (void)discard; + } + + fclose(fp); + return true; +} + +} // namespace Detail + +DiskMeter::DiskMeter(cr::milliseconds tick_period, const std::vector &devices) + : Meter("DiskMeter", tick_period, [] { return new DiskMetricsTimeSeries(); }), + devices(devices.begin(), devices.end()) { + if (!Detail::checkDiskExistence(this->devices)) { + LOG(ERROR) << absl::StrFormat( + "[DiskMeter] Some devices do not exist in %s", PROCDISKSTATSFILE); + return; + } + + markValid(); +} + +bool DiskMeter::update(bool testrun) { + UNUSED(testrun); + + DiskMetrics *metrics = 
+    return Detail::parseProcDiskstats(devices, metrics);
+}
+
+std::string DiskMeter::getDetailedReport() const {
+    std::string report;
+    report += absl::StrFormat("Monitored devices:");
+    for (const auto &dev : devices) {
+        report += absl::StrFormat("\n  - %s", dev);
+    }
+    return report;
+}
+
+} // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/src/gpu_meter.cc b/monitoring_sys/src/gpu_meter.cc
new file mode 100644
index 0000000..33a6cb7
--- /dev/null
+++ b/monitoring_sys/src/gpu_meter.cc
@@ -0,0 +1,273 @@
+#include "include/gpu_meter.hh"
+
+namespace MSys {
+
+namespace Detail {
+
+/**
+ * Retrieve the support status of GPM capabilities for a given GPU
+ * NOTE: NVML GPM is supported on Hopper or newer fully supported devices; refer
+ * to the NVIDIA GPM documentation at
+ * https://docs.nvidia.com/deploy/nvml-api/group__nvmlGpmFunctions.html
+ *
+ * @param nvml_dev target device to query
+ * @return whether the device supports GPM
+ */
+static inline bool isGPMSupported(const nvmlDevice_t &nvml_dev) {
+    nvmlGpmSupport_t ret;
+    ret.version = NVML_GPM_SUPPORT_VERSION;
+    nvmlCall(nvmlGpmQueryDeviceSupport(nvml_dev, &ret));
+    return ret.isSupportedDevice;
+}
+
+// NOTE: with (potentially) expensive string formation cost
+static inline std::string getDeviceName(const nvmlDevice_t &nvml_dev) {
+    constexpr unsigned name_length = NVML_DEVICE_NAME_V2_BUFFER_SIZE;
+    char name[name_length];
+    nvmlCall(nvmlDeviceGetName(nvml_dev, name, name_length));
+    return std::string(name);
+}
+
+// NOTE: with (potentially) expensive string formation cost
+static inline std::string getDeviceBusID(const nvmlDevice_t &nvml_dev) {
+    nvmlPciInfo_t nvml_pci_info;
+    nvmlCall(nvmlDeviceGetPciInfo(nvml_dev, &nvml_pci_info));
+    return std::string(nvml_pci_info.busId);
+}
+
+static inline std::pair<int, int> getCUDAComputeCapability(const nvmlDevice_t &nvml_dev) {
+    int major = 0, minor = 0;
+    nvmlCall(nvmlDeviceGetCudaComputeCapability(nvml_dev, &major, &minor));
+    return std::make_pair(major, minor);
+}
+
+static inline unsigned getDevicePCIeLinkGeneration(const nvmlDevice_t &nvml_dev) {
+    unsigned link_gen;
+    nvmlCall(nvmlDeviceGetCurrPcieLinkGeneration(nvml_dev, &link_gen));
+    return link_gen;
+}
+
+static inline unsigned getDevicePCIeLinkWidth(const nvmlDevice_t &nvml_dev) {
+    unsigned link_width;
+    nvmlCall(nvmlDeviceGetCurrPcieLinkWidth(nvml_dev, &link_width));
+    return link_width;
+}
+
+static inline void parseGPUProperties(const nvmlDevice_t &nvml_dev, GPUProperties *metadata) {
+    // metadata->dev_name
+    metadata->set_dev_name(getDeviceName(nvml_dev));
+    // metadata->bus_id
+    metadata->set_bus_id(getDeviceBusID(nvml_dev));
+    // metadata->compute_capability
+    std::pair<int, int> device_CC = getCUDAComputeCapability(nvml_dev);
+    CUDACC *cc = metadata->mutable_compute_capability();
+    cc->set_major(device_CC.first);
+    cc->set_minor(device_CC.second);
+    // metadata->link_generation
+    metadata->set_link_generation(getDevicePCIeLinkGeneration(nvml_dev));
+    // metadata->link_width
+    metadata->set_link_width(getDevicePCIeLinkWidth(nvml_dev));
+}
+
+static inline bool parseGPUNVML(
+    unsigned gpu_id, const nvmlDevice_t &nvml_dev, const std::vector<unsigned> &nvml_metrics,
+    PerGPUMetrics *metrics) {
+    // FIXME: not implemented for now
+    UNUSED(gpu_id);
+    UNUSED(nvml_dev);
+    UNUSED(nvml_metrics);
+    UNUSED(metrics);
+    return true;
+}
+
+static inline bool parseGPUGPM(
+    unsigned gpu_id, const nvmlDevice_t &nvml_dev, nvmlGpmMetricsGet_t &mg,
+    nvmlGpmSample_t &sample1, nvmlGpmSample_t &sample2, 
PerGPUMetrics *metrics) { + mg.sample1 = sample1; + mg.sample2 = sample2; + nvmlCall(nvmlGpmSampleGet(nvml_dev, sample2)); + nvmlCall(nvmlGpmMetricsGet(&mg)); + + std::swap(sample1, sample2); + if (mg.metrics->nvmlReturn != NVML_SUCCESS) { + LOG(ERROR) << absl::StrFormat( + "[GPUMeter] NVML GPM metrics get failed for GPU %d: %d (%s)", gpu_id, + mg.metrics->nvmlReturn, nvmlErrorString(mg.metrics->nvmlReturn)); + return false; + } + + // get all metrics values from mg + for (unsigned metrics_id = 0; metrics_id < mg.numMetrics; metrics_id++) { + metrics->add_gpm_metrics_values(mg.metrics[metrics_id].value); + } + return true; +} + +static inline bool parseGPUProcesses( + unsigned gpu_id, const nvmlDevice_t &nvml_dev, PerGPUMetrics *metrics) { + UNUSED(gpu_id); + + unsigned info_count = 0; + nvmlDeviceGetComputeRunningProcesses(nvml_dev, &info_count, nullptr); + + nvmlProcessInfo_t *infos = new nvmlProcessInfo_t[info_count]; + nvmlCall(nvmlDeviceGetComputeRunningProcesses(nvml_dev, &info_count, infos)); + + for (unsigned i = 0; i < info_count; i++) { + const nvmlProcessInfo_t &info = infos[i]; + PerProcessGPUMetrics *process_metrics = metrics->add_per_process_gpu_metrics(); + process_metrics->set_pid(info.pid); + process_metrics->set_used_gpu_memory(info.usedGpuMemory); + } + + delete[] infos; + return true; +} + +} // namespace Detail + +static constexpr cr::milliseconds min_tick_period{100}; + +GPUMeter::GPUMeter( + cr::milliseconds tick_period, const std::vector &gpu_ids, + const std::vector &nvml_metrics, const std::vector &gpm_metrics) + : Meter("GPUMeter", tick_period, [] { return new GPUMetricsTimeSeries(); }), + gpu_ids(gpu_ids), + nvml_metrics(nvml_metrics), + gpm_metrics(gpm_metrics), + nvml_devs(gpu_ids.size()), + gpm_samples(gpu_ids.size()) { + if (tick_period < min_tick_period) { + LOG(WARNING) << absl::StrFormat( + "[GPUMeter] GPM tick period should be greater than %d, get %d, " + "enforcing %d", + min_tick_period.count(), tick_period.count(), min_tick_period.count()); + tick_period = min_tick_period; + } + + // initialize nvml for corresponding devices + nvmlCall(nvmlInit()); + for (unsigned gpu_idx = 0; gpu_idx < gpu_ids.size(); gpu_idx++) { + unsigned gpu_id = gpu_ids[gpu_idx]; + + nvmlDevice_t nvml_dev; + nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(gpu_id, &nvml_dev); + if (ret != NVML_SUCCESS) { + LOG(ERROR) << absl::StrFormat( + "[GPUMeter] NVML cannot be attached to GPU with ID: %d, dropping", gpu_id); + continue; + } + + NVMLProperties nvml_prop = {}; + // check if GPM is supported on the device + nvml_prop.gpm_supported = Detail::isGPMSupported(nvml_dev); + if (!nvml_prop.gpm_supported) + LOG(ERROR) << absl::StrFormat( + "[GPUMeter] GPU with ID: %d does not support GPM", gpu_id); + + // add to tracing candidates + nvml_devs[gpu_idx] = std::make_pair(nvml_dev, nvml_prop); + nvmlCall(nvmlGpmSampleAlloc(&gpm_samples[gpu_idx].first)); + nvmlCall(nvmlGpmSampleAlloc(&gpm_samples[gpu_idx].second)); + } + + gpm_mg_format.version = NVML_GPM_METRICS_GET_VERSION; + gpm_mg_format.numMetrics = (unsigned)gpm_metrics.size(); + for (size_t metrics_idx = 0; metrics_idx < gpm_metrics.size(); metrics_idx++) { + unsigned metric_id = gpm_metrics[metrics_idx]; + gpm_mg_format.metrics[metrics_idx].metricId = static_cast(metric_id); + } + + markValid(); +} + +GPUMeter::~GPUMeter() { nvmlCall(nvmlShutdown()); } + +bool GPUMeter::update(bool testrun) { + /* + * NVML GPM metrics need two samples to calculate the metrics, so the first + * time we call this function, the first sample is retrieved 
without + * getting te metrics. + */ + + if (unlikely(testrun)) { + for (unsigned gpu_idx = 0; gpu_idx < gpu_ids.size(); gpu_idx++) + nvmlCall(nvmlGpmSampleGet(nvml_devs[gpu_idx].first, gpm_samples[gpu_idx].first)); + } + + if (unlikely(!testrun && !started)) { + for (unsigned gpu_idx = 0; gpu_idx < gpu_ids.size(); gpu_idx++) + nvmlCall(nvmlGpmSampleGet(nvml_devs[gpu_idx].first, gpm_samples[gpu_idx].first)); + started = true; + return true; + } + + GPUMetrics *gpu_metrics = getCurrentBuffer()->add_metrics(); + gpu_metrics->set_timestamp(cr::steady_clock::now().time_since_epoch().count()); + + int ret = true; + for (unsigned gpu_idx = 0; gpu_idx < gpu_ids.size(); gpu_idx++) { + unsigned gpu_id = gpu_ids[gpu_idx]; + + PerGPUMetrics *per_gpu_metrics = gpu_metrics->add_per_gpu_metrics(); + + // parse NVML metrics + ret &= + Detail::parseGPUNVML(gpu_id, nvml_devs[gpu_idx].first, nvml_metrics, per_gpu_metrics); + + // parse GPM metrics + nvmlGpmMetricsGet_t mg; + memcpy(&mg, &gpm_mg_format, sizeof(nvmlGpmMetricsGet_t)); + ret &= Detail::parseGPUGPM( + gpu_id, nvml_devs[gpu_idx].first, mg, gpm_samples[gpu_idx].first, + gpm_samples[gpu_idx].second, per_gpu_metrics); + + ret &= Detail::parseGPUProcesses(gpu_id, nvml_devs[gpu_idx].first, per_gpu_metrics); + } + + return ret; +} + +std::string GPUMeter::getDetailedReport() const { + std::string report = absl::StrFormat( + "GPUMeter: recording %d GPU(s), #NVML metrics: %d, #GPM metrics: %d", gpu_ids.size(), + nvml_metrics.size(), gpm_metrics.size()); + report += "\nGPU details:"; + for (unsigned gpu_idx = 0; gpu_idx < gpu_ids.size(); gpu_idx++) { + unsigned gpu_id = gpu_ids[gpu_idx]; + report += absl::StrFormat( + "\n - GPU %d (%s)", gpu_id, + nvml_devs[gpu_idx].second.gpm_supported ? "GPM supported" : "GPM NOT supported"); + } + + if (nvml_metrics.size() > 0) { + report += "\nNVML enabled probe(s):"; + const proto::EnumDescriptor *nvml_enum_desc = + proto::GetEnumDescriptor(); + for (const auto &metric : nvml_metrics) { + unsigned metric_value = static_cast(metric); + const proto::EnumValueDescriptor *value_desc = + nvml_enum_desc->FindValueByNumber(metric_value); + report += absl::StrFormat( + "\n - %s.%s (%d)", nvml_enum_desc->full_name().data(), value_desc->name().data(), + metric_value); + } + } + + if (gpm_metrics.size() > 0) { + report += "\nGPM enabled probe(s):"; + const proto::EnumDescriptor *gpm_enum_desc = + proto::GetEnumDescriptor(); + for (const auto &metric : gpm_metrics) { + unsigned metric_value = static_cast(metric); + const proto::EnumValueDescriptor *value_desc = + gpm_enum_desc->FindValueByNumber(metric_value); + report += absl::StrFormat( + "\n - %s.%s (%d)", gpm_enum_desc->full_name().data(), value_desc->name().data(), + metric_value); + } + } + return report; +} + +} // namespace MSys \ No newline at end of file diff --git a/monitoring_sys/src/interface.cc b/monitoring_sys/src/interface.cc new file mode 100644 index 0000000..38aa524 --- /dev/null +++ b/monitoring_sys/src/interface.cc @@ -0,0 +1,276 @@ +#include +#include + +#ifdef PYBIND11_RICH_INTERFACE +#include "generated/interface/pybind11_defs.h" +#endif +#include "include/msys.hh" + +namespace py = pybind11; + +/** + * NOTE: [Write Path Validation] All of the path exists inputted into the + * system will only be validated at input checking time. 
Any changes to the + * in-filesystem state for the inputted paths and any parent paths ( + * including moving, renaming, and other actions that can possibly make the + * inputted path invalid) are considered undefined behavior and will not be + * actively checked by the system at runtime. + * + * TODO: Make sure all required path for the system are validated at system + * initialization phase, so that the external FS change will only be causing + * undefined behavior when they are done during initialization phase. + */ + +// Contains all the interface functions that should be registered with pybind11 +namespace MSys::Interface { + +namespace Detail { + +/** + * Get appropiate sampling period given a monitoring sys and external user input + * @note system share_ptr is taken by reference, caller need to ensure the + * shared_ptr is not destroyed during this function call + * @param system system the meter is being added to + * @param period_ms sampling period specified by user, equal to 0 if the user + * leave the option empty + * @return appropiate sampling period for the meter + */ +inline cr::milliseconds getSamplePeriod(std::shared_ptr &system, unsigned period_ms) { + return period_ms == 0 ? system->getDefaultSamplePeriod() : cr::milliseconds{period_ms}; +} + +inline bool addMeterToSystem(std::shared_ptr &system, std::unique_ptr &&meter) { + std::string_view meter_name = meter->getName(); + bool status = system->addMeter(std::move(meter)); + if (!status) + LOG(WARNING) << absl::StrFormat( + "[Interface] Try adding %s meter to system %d failed", meter_name, + system->getSystemID()); + return status; +} + +} // namespace Detail + +/** + * Initialize the underlying monitoring system + * + * @param log_dir directory to place log, must exist and writeable, otherwise + * the initialization will fail + * @return whether the initialization is successful + */ +bool initialize(const std::string &log_dir) { return msysInitialize(log_dir); } + +/** + * Construct a monitoring system and return its index as an identifier to access + * that instance. 
An individual thread will be spawned to handle one meter + * + * @param default_sample_period_ms default sample period for all the meters + * added to the system if a explicit sample period is not given at meter + * creation time + * @return an ID associated with the system + */ +SystemID getMonitoringSystem( + const std::string &output_dir, unsigned default_sample_period_ms = 500) { + return constructNewSystem(output_dir, default_sample_period_ms); +} + +/** + * Add a monitor probe to CPU + * + * @param id target SystemID to add the probe + * @param sample_period_ms sample period, same as system if not specified + * @return whether adding the probe is successful + */ +bool addCPUMeterToSystem(SystemID id, unsigned sample_period_ms = 0) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + return Detail::addMeterToSystem( + system, std::make_unique(Detail::getSamplePeriod(system, sample_period_ms))); +} + +/** + * Add a monitor probe to some GPUs + * + * @param id target SystemID to add the probe + * @param gpu_ids list of GPUs to probe + * @param nvml_metrics list of NVML metrics + * @param gpm_metrics list of GPM metrics + * @param sample_period_ms sample period, same as system if not specified + * @return whether adding the probe is successful + */ +bool addGPUMeterToSystem( + SystemID id, std::vector gpu_ids, std::vector nvml_metrics, + std::vector gpm_metrics, unsigned sample_period_ms = 0) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + return Detail::addMeterToSystem( + system, + std::make_unique( + Detail::getSamplePeriod(system, sample_period_ms), gpu_ids, nvml_metrics, gpm_metrics)); +} + +/** + * Add a monitor probe to some block devices + * + * @param id target SystemID to add the probe + * @param devices list of devices to monitor + * @param sample_period_ms sample period, same as system if not specified + * @return whether adding the probe is successful + */ +bool addDiskMeterToSystem( + SystemID id, std::vector devices, unsigned sample_period_ms = 0) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + std::unique_ptr meter = + std::make_unique(Detail::getSamplePeriod(system, sample_period_ms), devices); + return system->addMeter(std::move(meter)); +} + +/** + * Add a monitor probe to some processes + * + * @param id target SystemID to add the probe + * @param pids list of processes to monitor + * @param probes list of probes to monitor, refer to ProcMetadata::Probe + * @param sample_period_ms sample period, same as system if not specified + * @return whether adding the probe is successful + */ +bool addProcMeterToSystem( + SystemID id, const std::vector &pids, const std::vector &probes, + unsigned sample_period_ms = 0) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + std::vector input_probes; + for (auto probe : probes) + input_probes.push_back(static_cast(probe)); + + std::unique_ptr meter = std::make_unique( + Detail::getSamplePeriod(system, sample_period_ms), pids, input_probes); + return system->addMeter(std::move(meter)); +} + +/** + * Add a memory monitor probe to the system + * + * @param id target SystemID to add the probe + * @param probes list of probes to monitor, refer to MemMetadata::Probe + * @param sample_period_ms sample period, same as system if not specified + * @return whether adding the probe is successful + */ +bool addMemMeterToSystem( + SystemID id, const std::vector &probes, unsigned 
sample_period_ms = 0) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + std::vector input_probes; + for (auto probe : probes) + input_probes.push_back(static_cast(probe)); + + std::unique_ptr meter = + std::make_unique(Detail::getSamplePeriod(system, sample_period_ms), input_probes); + return system->addMeter(std::move(meter)); +} + +bool startRecording(SystemID id) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + return system->startRecording(); +} + +bool stopRecording(SystemID id) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + return system->stopRecording(); +} + +void reportStatus(SystemID id, bool verbose = false, bool detail = false) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) { + verbosePrint(verbose, "System with ID %d does not exist", id); + return; + } + system->reportStatus(verbose, detail); +} + +bool testRun(SystemID id, bool fail_on_error = false) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + // Perform a test run, which will update all meters in the system + bool ret = system->testRun(); + if (!ret) { + absl::LogSeverity severity = + fail_on_error ? absl::LogSeverity::kFatal : absl::LogSeverity::kError; + LOG(LEVEL(severity)) << absl::StrFormat( + "[Interface] System %d (%s) Test run FAILED", id, system->getSystemName().data()); + } + return ret; +} + +} // namespace MSys::Interface + +// === Internal details BEGIN === +// Expose a function to python using the same function name in c++ +// NOTE: The function to be registered must resides in namespace MSys::Interface +// TODO: Extract interface namespace into new macro to allow quick modification +#if defined(PYBIND11_RICH_INTERFACE) && defined(PYBIND11_ARG_INFO_GEN) +// Use a modified pybind11-mkdoc with arg info in macros +/* FIXME: This does not work with overloaded functions because the macro + *`PYBIND11_ARG_TYPE(...)` cannot resolve correctly, should call + * `PYBIND11_ARG_TYPE(MSys, Interface, func)(&MSys::Interface::func)` on + * function registration if the macro resolves correctly */ +#define MSYS_BIND(m, func, ...) \ + m.def( \ + #func, &MSys::Interface::func, PYBIND11_ARG_NAME(MSys, Interface, func), \ + PyDoc_STR(PYBIND11_DOC(MSys, Interface, func)), ##__VA_ARGS__) +// TODO: this still cannot resolve PYBIND11_ARG_NAME & PYBIND11_DOC ambiguity +// #define MSYS_OVERLOAD_BIND(m, func, ...) m.def(#func, +// pybind11::overload_cast<__VA_ARGS__>(&MSys::Interface::func), +// PYBIND11_ARG_NAME(MSys, Interface, func), +// PyDoc_STR(PYBIND11_DOC(MSys, Interface, func))) +#define INTERFACE_DOCSTR PyDoc_STR(PYBIND11_DOC(PYBIND11, MODULE)) +#elif defined(PYBIND11_RICH_INTERFACE) +// Use a unmodified version of pybind11-mkdoc +// FIXME: This also does not work with overloaded functions +#define MSYS_BIND(m, func, ...) \ + m.def(#func, &MSys::Interface::func, PyDoc_STR(DOC(MSys, Interface, func)), ##__VA_ARGS__) +#define INTERFACE_DOCSTR (PyDoc_STR(DOC(PYBIND11, MODULE))) +#else +// No pybind11-mkdoc is found +// Fallback to simple binding of function name only +#define MSYS_BIND(m, func, ...) 
m.def(#func, &MSys::Interface::func, ##__VA_ARGS__) +#define INTERFACE_DOCSTR "" +#endif + +// Relies on external MSYS_MODNAME passed to build system +#ifndef MSYS_MODNAME +#error monitoring system name (MSYS_MODNAME) is not set +#endif +// wrapper to PYBIND11_MODULE that allows macro as name +#define PYBIND11_MODULE_WRAPPED(name, variable) PYBIND11_MODULE(name, variable) +// === Internal details END === + +/** + * Interface for System Performance Monitor + */ +PYBIND11_MODULE_WRAPPED(MSYS_MODNAME, m) { + m.doc() = INTERFACE_DOCSTR; + + // === Interface functions === + MSYS_BIND(m, initialize); + MSYS_BIND(m, getMonitoringSystem); + MSYS_BIND(m, addCPUMeterToSystem); + MSYS_BIND(m, addGPUMeterToSystem); + MSYS_BIND(m, addProcMeterToSystem); + MSYS_BIND(m, addDiskMeterToSystem); + MSYS_BIND(m, addMemMeterToSystem); + + MSYS_BIND(m, startRecording); + MSYS_BIND(m, stopRecording); + MSYS_BIND(m, reportStatus); + MSYS_BIND(m, testRun); +} diff --git a/monitoring_sys/src/logger.cc b/monitoring_sys/src/logger.cc new file mode 100644 index 0000000..6dc868e --- /dev/null +++ b/monitoring_sys/src/logger.cc @@ -0,0 +1,142 @@ +#include +#include +#include + +#include +#include +#include + +#include "date/date.h" +#include "date/tz.h" +#include "include/logger.hh" + +namespace MSys { + +namespace Detail { + +constexpr cr::duration flush_interval_seconds = cr::seconds(60); + +class FileLogSink : public absl::LogSink { + public: + explicit FileLogSink(const std::string &filename) : filename(filename) { + last_flush_time = cr::steady_clock::now(); + } + + ~FileLogSink() override { + if (log_file_.is_open()) { + log_file_.flush(); + log_file_.close(); + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + fprintf( + stderr, "[FileLogSink] Log file saved to %s:0 (at %s)\n", filename.c_str(), + date::format("%Y-%m-%d %H:%M:%S %z", date::make_zoned(date::current_zone(), now)) + .c_str()); + } + } + + void Send(const absl::LogEntry &entry) override { + std::lock_guard lock(mu_); + // lazy file allocation + if (!log_file_.is_open()) { + log_file_.open(filename, std::ios::out | std::ios::app); + assert(log_file_.is_open()); + } + log_file_ << entry.text_message_with_prefix_and_newline_c_str(); + + cr::steady_clock::time_point current_time = cr::steady_clock::now(); + if (current_time - last_flush_time >= flush_interval_seconds) { + log_file_.flush(); + last_flush_time = current_time; + } + } + + private: + const std::string filename; + std::ofstream log_file_; + std::mutex mu_; + + cr::steady_clock::time_point last_flush_time; +}; + +class Logger { + public: + static constexpr std::string_view log_filename = "libmsys.log"; + static constexpr std::string_view term_report_filename = "libmsys.term.log"; + + /** + * Assumes the input log path is a valid directory. 
+ * + * @param log_dir directory to store logs + */ + Logger(const fs::path &log_dir) + : log_dir(log_dir), + log_file_path(log_dir / log_filename.data()), + term_report_file_path(log_dir / term_report_filename.data()) { + absl::InitializeLog(); + if (log_dir.empty()) { + absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo); + LOG(INFO) << "[Logger] Initialized with no log directory, logging to stderr."; + } else { + file_sink = new FileLogSink(log_file_path); + absl::AddLogSink(file_sink); + absl::SetStderrThreshold(absl::LogSeverityAtLeast::kError); + } + } + + ~Logger() { + if (log_dir.empty()) return; + absl::RemoveLogSink(file_sink); + delete file_sink; + } + + const fs::path &getLoggerFolder() { return log_dir; } + + const fs::path &getLoggerFile() { return log_file_path; } + + const fs::path &getTermReportFile() { return term_report_file_path; } + + private: + const fs::path log_dir; + const fs::path log_file_path; + const fs::path term_report_file_path; + FileLogSink *file_sink = nullptr; +}; + +static Logger *logger = nullptr; + +} // namespace Detail + +// this is not thread safe +bool loggerInitialize(const std::string &log_dir) { + if (Detail::logger) return false; + + if (log_dir.empty()) { + Detail::logger = new Detail::Logger(fs::path()); + } else { + fs::path p = validateDir(log_dir); + if (p.empty()) { + LOG(ERROR) << absl::StrFormat("[Logger] Invalid log dir %s", log_dir.c_str()); + return false; + } + if (access(p.c_str(), W_OK)) { + LOG(ERROR) << absl::StrFormat("[Logger] Cannot write to log dir %s", p.c_str()); + return false; + } + + Detail::logger = new Detail::Logger(p); + } + return true; +} + +const fs::path &getLoggerFolder() { return Detail::logger->getLoggerFolder(); } + +const fs::path &getLoggerFile() { return Detail::logger->getLoggerFile(); } + +void loggerDeinitialize() { + if (Detail::logger) { + delete Detail::logger; + Detail::logger = nullptr; + } +} + +} // namespace MSys diff --git a/monitoring_sys/src/mem_meter.cc b/monitoring_sys/src/mem_meter.cc new file mode 100644 index 0000000..dd25ee6 --- /dev/null +++ b/monitoring_sys/src/mem_meter.cc @@ -0,0 +1,232 @@ +#include "include/mem_meter.hh" + +namespace MSys { + +namespace Detail { + +/** + * Memory information keys. + * @note Follow proto file definition order, not the field number order. + */ + +// Basic memory information keys. 
+static const std::vector<std::string> mem_basic_info_keys = {
+    "MemTotal",
+    "MemFree",
+    "MemAvailable",
+};
+
+static const std::vector<std::string> mem_kernel_cache_keys = {
+    "Buffers",
+    "Cached",
+    "SwapCached",
+};
+
+static const std::vector<std::string> mem_active_inactive_keys = {
+    "Active", "Inactive", "Active(anon)", "Inactive(anon)", "Active(file)", "Inactive(file)",
+};
+
+static const std::vector<std::string> mem_non_evictable_keys = {
+    "Unevictable",
+    "Mlocked",
+};
+
+static const std::vector<std::string> mem_swap_keys = {
+    "SwapTotal",
+    "SwapFree",
+    "Zswap",
+    "Zswapped",
+};
+
+static const std::vector<std::string> mem_dirty_writeback_keys = {
+    "Dirty",
+    "Writeback",
+};
+
+static const std::vector<std::string> mem_type_keys = {
+    "AnonPages",
+    "Mapped",
+    "Shmem",
+};
+
+static const std::vector<std::string> mem_kernel_keys = {
+    "KReclaimable", "Slab", "SReclaimable", "SUnreclaim", "KernelStack", "PageTables",
+};
+
+static const std::vector<std::string> mem_tmp_buffer_keys = {
+    "NFS_Unstable",
+    "Bounce",
+    "WritebackTmp",
+};
+
+static const std::vector<std::string> mem_virtual_keys = {
+    "CommitLimit", "Committed_AS", "VmallocTotal", "VmallocUsed", "VmallocChunk",
+};
+
+static const std::vector<std::string> mem_huge_page_keys = {
+    "AnonHugePages", "ShmemHugePages", "ShmemPmdMapped", "FileHugePages",
+    "FilePmdMapped", "HugePages_Total", "HugePages_Free", "HugePages_Rsvd",
+    "HugePages_Surp", "Hugepagesize", "Hugetlb",
+};
+
+static const std::vector<std::string> mem_direct_map_keys = {
+    "DirectMap4k",
+    "DirectMap2M",
+    "DirectMap4M",
+    "DirectMap1G",
+};
+
+static const std::vector<std::string> mem_misc_keys = {"Percpu", "HardwareCorrupted"};
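+/* Illustration (added comment, not part of the original sources): each key
+ * above is matched against one line of /proc/meminfo, e.g.
+ *
+ *   MemTotal:       65536000 kB
+ *
+ * and the parsed value lands in the proto field at the same position within
+ * the corresponding sub-message (field definition order, not field number
+ * order, per the note above).
+ */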
+/*
+ * TODO: Because proto3 cannot have constants specified in the .proto file, the probe keys are
+ * listed in this file. This is somewhat imperfect, as the keys are not visible to other
+ * languages. However, only the C++ code parses the actual files, so at this stage, just
+ * remember to change/add the field correspondence in this file when the corresponding proto
+ * file is changed.
+ */
+static const std::unordered_map<MemMetadata::Probe, std::vector<std::string>> mem_info_keys_map = {
+    {MemMetadata::MEM_BASIC, mem_basic_info_keys},
+    {MemMetadata::MEM_KERNEL_CACHE, mem_kernel_cache_keys},
+    {MemMetadata::MEM_ACTIVE_INACTIVE, mem_active_inactive_keys},
+    {MemMetadata::MEM_NON_EVICTABLE, mem_non_evictable_keys},
+    {MemMetadata::MEM_SWAP, mem_swap_keys},
+    {MemMetadata::MEM_DIRTY_WRITEBACK, mem_dirty_writeback_keys},
+    {MemMetadata::MEM_TYPE, mem_type_keys},
+    {MemMetadata::MEM_KERNEL, mem_kernel_keys},
+    {MemMetadata::MEM_TMP_BUFFER, mem_tmp_buffer_keys},
+    {MemMetadata::MEM_VIRTUAL, mem_virtual_keys},
+    {MemMetadata::MEM_HUGE_PAGE, mem_huge_page_keys},
+    {MemMetadata::MEM_DIRECT_MAP, mem_direct_map_keys},
+    {MemMetadata::MEM_MISC, mem_misc_keys},
+};
+
+class MemInfoMap {
+ public:
+  using KeyType = MemMetadata::Probe;
+  using ValueType = std::pair<const proto::Descriptor *, std::vector<std::string>>;
+
+  MemInfoMap() {
+    const proto::Descriptor *mem_info_metrics_desc = MemInfoMetrics::descriptor();
+
+    probe_info_map.reserve(mem_info_keys_map.size());
+    for (const auto &pair : mem_info_keys_map) {
+      MemMetadata::Probe probe = pair.first;
+      const std::vector<std::string> &keys = pair.second;
+      const proto::FieldDescriptor *field_desc =
+          mem_info_metrics_desc->FindFieldByNumber(static_cast<int>(probe));
+      if (!field_desc) return;
+      probe_info_map.emplace(probe, std::make_pair(field_desc->message_type(), keys));
+    }
+    valid = true;
+  }
+
+  bool isValid() const { return valid; }
+
+  const std::unordered_map<KeyType, ValueType> &getProbeInfoMap() const { return probe_info_map; }
+
+ private:
+  std::unordered_map<KeyType, ValueType> probe_info_map;
+  bool valid = false;
+};
+
+MemInfoMap mem_info_map;
+
+bool parseMemStat(
+    const std::vector<MemMetadata::Probe> &probes, std::unique_ptr<KVRepr> &mem_info_repr,
+    MemInfoMetrics *mem_info_metrics) {
+  const proto::Reflection *reflection = mem_info_metrics->GetReflection();
+  const proto::Descriptor *desc = mem_info_metrics->descriptor();
+
+  std::vector<proto::Message *> parsed_messages;
+  for (size_t probe_idx = 0; probe_idx < probes.size(); ++probe_idx) {
+    MemMetadata::Probe probe = probes[probe_idx];
+    const proto::FieldDescriptor *probe_field_desc =
+        desc->FindFieldByNumber(static_cast<int>(probe));
+    parsed_messages.push_back(reflection->MutableMessage(mem_info_metrics, probe_field_desc));
+  }
+
+  bool ret = mem_info_repr->parseOnce(parsed_messages);
+  if (unlikely(!ret)) {
+    LOG(ERROR) << absl::StrFormat(
+        "[MemMeter] Failed to parse %s", mem_info_repr->getStatFilePath().c_str());
+    return false;
+  }
+  return true;
+}
+
+static std::string getProbeReport(const std::vector<MemMetadata::Probe> &probes) {
+  std::string report = "Enabled probe(s):";
+
+  if (probes.empty()) {
+    report += "\n N/A";
+    return report;
+  }
+
+  const proto::EnumDescriptor *probe_enum_desc = proto::GetEnumDescriptor<MemMetadata::Probe>();
+  for (const MemMetadata::Probe &probe : probes) {
+    unsigned probe_value = static_cast<unsigned>(probe);
+    const proto::EnumValueDescriptor *value_desc =
+        probe_enum_desc->FindValueByNumber(probe_value);
+    report += absl::StrFormat(
+        "\n - %s.%s (%d)", probe_enum_desc->full_name().data(), value_desc->name().data(),
+        probe_value);
+  }
+  return report;
+}
+
+}  // namespace Detail
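+/* Parse-path sketch (illustrative, not part of the original sources): the
+ * MemMeter below builds a KVRepr over /proc/meminfo using the scanf format
+ * "%64[^:]: %32s kB ", i.e. a key of at most 64 chars terminated by ':' and
+ * a value of at most 32 chars. For a probe set such as
+ *
+ *   std::vector<MemMetadata::Probe> probes = {MemMetadata::MEM_BASIC,
+ *                                             MemMetadata::MEM_SWAP};
+ *   MemMeter meter(cr::milliseconds(500), probes);
+ *
+ * each update() resolves one sub-message per probe by proto field number and
+ * fills it from the matching /proc/meminfo lines via parseMemStat() above.
+ */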
+
+MemMeter::MemMeter(cr::milliseconds tick_period, const std::vector<MemMetadata::Probe> &probes)
+    : Meter("MemMeter", tick_period, [] { return new MemMetricsTimeSeries(); }), probes(probes) {
+  const auto &mem_info_map = Detail::mem_info_map.getProbeInfoMap();
+  if (!Detail::mem_info_map.isValid()) {
+    LOG(ERROR) << "[MemMeter] MemInfoMap failed to initialize";
+    return;
+  }
+
+  const proto::EnumDescriptor *probe_enum_desc = proto::GetEnumDescriptor<MemMetadata::Probe>();
+  std::vector<const proto::Descriptor *> message_descs(probes.size());
+  std::vector<std::vector<std::string>> key_lists(probes.size());
+  for (size_t probe_idx = 0; probe_idx < probes.size(); ++probe_idx) {
+    MemMetadata::Probe probe = probes[probe_idx];
+
+    auto it = mem_info_map.find(probes[probe_idx]);
+    if (it == mem_info_map.end()) {
+      unsigned int probe_value = static_cast<unsigned int>(probe);
+      const proto::EnumValueDescriptor *probe_field_desc =
+          probe_enum_desc->FindValueByNumber(probe_value);
+      LOG(ERROR) << absl::StrFormat(
+          "[MemMeter] Unsupported probe type: %s.%s (%d)",
+          probe_enum_desc->full_name().data(),
+          probe_field_desc ? probe_field_desc->name().data() : "", probe_value);
+      return;
+    }
+    message_descs[probe_idx] = it->second.first;
+    key_lists[probe_idx] = it->second.second;
+  }
+
+  mem_info_repr = std::make_unique<KVRepr>(
+      PROCMEMINFOFILE, message_descs, key_lists, "%64[^:]: %32s kB ", 64, 32);
+
+  markValid();
+}
+
+bool MemMeter::update(bool testrun) {
+  UNUSED(testrun);
+
+  MemMetrics *mem_info_metrics = getCurrentBuffer()->add_metrics();
+  return Detail::parseMemStat(probes, mem_info_repr, mem_info_metrics->mutable_meminfo_metrics());
+}
+
+std::string MemMeter::getDetailedReport() const {
+  std::string report;
+  if (!mem_info_repr) {
+    report += "MemMeter not properly initialized.";
+    return report;
+  }
+  report += Detail::getProbeReport(probes);
+  report += "\n" + mem_info_repr->generateStatusReport();
+  return report;
+}
+
+}  // namespace MSys
\ No newline at end of file
diff --git a/monitoring_sys/src/meter.cc b/monitoring_sys/src/meter.cc
new file mode 100644
index 0000000..7f7c0c8
--- /dev/null
+++ b/monitoring_sys/src/meter.cc
@@ -0,0 +1,158 @@
+#include "include/meter.hh"
+
+#include
+
+namespace MSys {
+
+Meter::Meter(
+    const std::string &name, cr::milliseconds tick_period,
+    std::function<proto::Message *()> stat_tser_factory, const std::string &file_suffix)
+    : name(name),
+      file_suffix(file_suffix),
+      tick_period(tick_period),
+      stat_tser(stat_tser_factory()),
+      stat_tser_dbuffer(stat_tser_factory()) {
+  if (!stat_tser || !stat_tser_dbuffer) {
+    LOG(FATAL) << absl::StrFormat(
+        "[Meter] %s stat_tser or stat_tser_dbuffer is null", name.c_str());
+  }
+}
+
+Meter::~Meter() {
+  // make sure writes are completed before destruction
+  std::default_delete<std::future<void>> deleter = async_write_ret.get_deleter();
+  std::future<void> *async_write_ret_ptr = async_write_ret.release();
+  if (async_write_ret_ptr && async_write_ret_ptr->valid()) {
+    async_write_ret_ptr->wait();
+  }
+  deleter(async_write_ret_ptr);
+
+  // release all resources
+  close(fd);
+  google::protobuf::Message *stat_tser_dbuffer_inst =
+      stat_tser_dbuffer.exchange(nullptr, std::memory_order_acquire);
+  delete stat_tser;
+  delete stat_tser_dbuffer_inst;
+  LOG(INFO) << absl::StrFormat("[Meter] %s destructed", name.c_str());
+}
+
+void Meter::resetBuffer() noexcept {
+  stat_tser->Clear();
+  stat_tser_dbuffer.load()->Clear();
+}
+
+ssize_t Meter::writeDataToFile(bool sync) noexcept {
+  if (unlikely(!stat_tser || stat_tser->ByteSizeLong() == 0)) return 0;
+  if (unlikely(!sync && fd < 0)) {
+    LOG(FATAL) << absl::StrFormat(
+        "[Meter] %s file descriptor is not set, cannot write data", name.c_str());
+    return -1;
+  }
+
+  if (unlikely(sync && fd < 0)) return -1;
+
+  proto::Message *cur_dbuffer = stat_tser_dbuffer.exchange(nullptr, std::memory_order_acquire);
+  if (unlikely(!cur_dbuffer)) {
+    LOG(WARNING) << absl::StrFormat(
+        "[Meter] %s stat_tser_dbuffer is null, "
+        "last write has not yet returned",
+        name.c_str());
+    return -1;
+  }
+
+  proto::Message *cur_stat_tser = stat_tser;
+  stat_tser = cur_dbuffer;
+
+  size_t current_msg_wire_size =
cur_stat_tser->ByteSizeLong(); + + // Write the current stat_tser to the file descriptor asynchronously + // This is to avoid blocking the current thread + async_write_ret = std::make_unique>( + std::async(std::launch::async, [this, cur_stat_tser, current_msg_wire_size]() -> void { + // Do !!!NOT!!! touch stat_tser in this function, it is used by current + // thread + + // write header, which is the size of the current message in wire format + ssize_t msg_size_written_size = + write(fd, ¤t_msg_wire_size, sizeof(current_msg_wire_size)); + // write the message itself + bool success = cur_stat_tser->SerializeToFileDescriptor(fd); + + written_times++; + written_size += msg_size_written_size + current_msg_wire_size; + + // clear the buffer after the write is done + cur_stat_tser->Clear(); + + if (msg_size_written_size < 0 || !success) { + LOG(ERROR) << absl::StrFormat( + "[Meter] %s failed to write data to file descriptor %d " + "[error: %d (%s), proto error: %c]", + name.c_str(), fd, errno, strerror(errno), success ? 'y' : 'n'); + } + + // store the current stat_tser back to the atomic buffer to signal write + // completion + stat_tser_dbuffer.store(cur_stat_tser, std::memory_order_release); + })); + if (sync) { + // wait for the async write to finish if sync is true + LOG(INFO) << absl::StrFormat("[Meter] %s waiting for async func to finish", name.c_str()); + async_write_ret->wait(); + } + + return current_msg_wire_size; +} + +void Meter::fsyncDataToFile() noexcept { + if (fd < 0) return; + + if (fsync(fd) < 0) { + LOG(ERROR) << absl::StrFormat( + "[Meter] %s failed to fsync file descriptor %d, error: %d (%s)", name.c_str(), fd, + errno, strerror(errno)); + } +} + +void Meter::assignOutputDir(const fs::path &output_dir) { + file_path = output_dir / (name + file_suffix); + fd = open(file_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + LOG(FATAL) << absl::StrFormat( + "[Meter] %s failed to open file %s for writing, error: %d (%s)", name.c_str(), + file_path.string().c_str(), errno, strerror(errno)); + return; + } + + // convert the file path to a canonical path + std::error_code ec; + file_path = fs::weakly_canonical(file_path, ec); +} + +const fs::path &Meter::getOutputPath() const { return file_path; } + +size_t Meter::getWrittenTimes() const { return written_times; } + +size_t Meter::getWrittenSize() const { return written_size; } + +const std::string_view Meter::getName() const { return std::string_view(name); } + +const cr::milliseconds Meter::getTickPeriod() const { return tick_period; } + +size_t Meter::getCurrentMessageMemorySize() { + if (!stat_tser) return 0; + return stat_tser->SpaceUsedLong(); +} + +size_t Meter::getCurrentMessageSerializedSize() { + if (!stat_tser) return 0; + return stat_tser->ByteSizeLong(); +} + +std::string Meter::getDetailedReport() const { return ""; } + +bool Meter::isValid() const { return is_valid; } + +void Meter::markValid() { is_valid = true; } + +} // namespace MSys diff --git a/monitoring_sys/src/msys.cc b/monitoring_sys/src/msys.cc new file mode 100644 index 0000000..3e99904 --- /dev/null +++ b/monitoring_sys/src/msys.cc @@ -0,0 +1,665 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/msys.hh" + +namespace MSys { + +namespace Detail { + +// === Monitoring system registering +// RNG used to assign system IDs +static std::mt19937 +#ifdef STABLE_RANDOM + rng(0); +#else + rng((std::random_device())()); +#endif 
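+/* Note (added for clarity): system IDs are either scrambled or sequential.
+ * With SCRAMBLE_SYSTEM_ID, getNewSystemID() below draws from this RNG until
+ * an unused ID is found; with STABLE_RANDOM the RNG is seeded with 0, so the
+ * scrambled IDs are reproducible across runs, e.g. for diffing test logs.
+ */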
+std::uniform_int_distribution uni_dist(0, INT32_MAX); +// records all existing systems, not thread-safe +static std::unordered_map> existing_systems; +// determine if full system shutdown is needed +static bool systemOnceInitialized = false; + +#ifdef SCRAMBLE_SYSTEM_ID +inline int getNewSystemID() { + SystemID id; + + do { + id = uni_dist(rng); + } while (existing_systems.find(id) != existing_systems.end()); + + return id; +} +#else +static SystemID current_system_id = 0; +inline SystemID getNewSystemID() { return current_system_id++; } +#endif + +// === Process termination handler === +/** + * Exit all the running monitoring system gracefully and persist any in-memory + * records to avoid data loss + * + * @param normal whether the termination is normal + */ +inline void processTerminationHandler(bool normal) { + // nothing is initialized, so teardown is also not needed + if (!systemOnceInitialized) return; + + absl::LogSeverity severity = normal ? absl::LogSeverity::kInfo : absl::LogSeverity::kWarning; + + // halt all existing systems + LOG(LEVEL(severity)) << absl::StrFormat( + "[ProcTermHandler] Performing graceful termination, halting all existing MSys (count: %zu)", + existing_systems.size()); + for (auto pair : existing_systems) { + LOG(LEVEL(severity)) << absl::StrFormat( + "[ProcTermHandler] Halting MSys #%u (%s)", pair.first, pair.second->getSystemName()); + pair.second->halt(); + } + LOG(LEVEL(severity)) << "[ProcTermHandler] System Halted"; + + // Write termination report + + LOG(LEVEL(severity)) << "[ProcTermHandler] Termination complete"; + + // Everything involving logger is completed, deinitialize logger + loggerDeinitialize(); +} + +/** + * Graceful termination callback on signal caught + * + * @param signum signal number received by the program + */ +void processSigTerminationHandler(int signum) { + const char *signal_name = strsignal(signum); + LOG(ERROR) << absl::StrFormat( + "[SigHandler] Caught signal: %s (signum %d), " + "performing monitor termination", + signal_name ? 
signal_name : "", signum); + + // call for final termination + processTerminationHandler(false); + + // mark the signal as not handled and re-raise the signal + struct sigaction sa; + sa.sa_handler = SIG_DFL; + sigaction(signum, &sa, nullptr); + raise(signum); +} + +/** + * Graceful termination callback on normal system exit + */ +void processNormalTerminationHandler() { + // call for final termination + processTerminationHandler(true); +} + +constexpr auto terminable_signals = + std::array{SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE, SIGALRM, + SIGTERM, SIGUSR1, SIGUSR2, SIGBUS, SIGTRAP, SIGXCPU, SIGXFSZ, SIGSYS}; + +struct TerminationHandlerStaticInitializer final { + TerminationHandlerStaticInitializer() { + for (int sig : terminable_signals) { + struct sigaction sa {}; + sa.sa_handler = processSigTerminationHandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(sig, &sa, nullptr); + } + + atexit(processNormalTerminationHandler); + } +}; +TerminationHandlerStaticInitializer handler_static_init; + +} // namespace Detail + +System::System( + SystemID id, const std::string &system_name, const fs::path &output_dir, + cr::milliseconds default_sample_period, size_t msg_write_size_threshold) + : system_id(id), + system_name(system_name), + output_dir(output_dir), + msg_write_size_threshold(msg_write_size_threshold), + default_sample_period(default_sample_period) { + LOG(INFO) << absl::StrFormat( + "[MSys] #%u (%s) initialized with " + "default sample period %ld ms, output dir %s", + system_id, getSystemName().data(), default_sample_period.count(), + output_dir.string().c_str()); +} + +System::~System() { LOG(INFO) << absl::StrFormat("[MSys] #%u destructed", system_id); } + +SystemID System::getSystemID() const { return system_id; } + +const std::string_view System::getSystemName() const { + return system_name.empty() ? 
system_default_name : std::string_view(system_name);
+}
+
+bool System::addMeter(std::unique_ptr<Meter> &&m) noexcept {
+  std::unique_lock lock(operation_status_mutex);
+  if (!in_operation) {
+    meter_list.push_back(std::move(m));
+    return true;
+  }
+  return false;
+}
+
+bool System::startRecording() {
+  std::unique_lock lock(operation_status_mutex);
+  if (!in_operation) {
+    // start the worker threads
+
+    if (!isValid()) {
+      LOG(FATAL) << absl::StrFormat(
+          "[MSys] #%u (%s) has at least one invalid meter", system_id,
+          getSystemName().data());
+    }
+
+    // FIXME: currently we only support meters with the same sample period as
+    // the system default
+    for (const std::unique_ptr<Meter> &meter : meter_list) {
+      if (meter->getTickPeriod() != default_sample_period) {
+        LOG(FATAL) << absl::StrFormat(
+            "[MSys] For system #%d (%s), meter %s has a tick period %d ms, "
+            "which is not equal to the system default sample period %d ms; "
+            "currently only meters with the same sample period as the "
+            "system default are supported",
+            system_id, getSystemName().data(), meter->getName().data(),
+            meter->getTickPeriod().count(), default_sample_period.count());
+        // cannot reach here
+      }
+    }
+
+    std::unordered_set<std::string> output_files;
+    for (unsigned meter_idx = 0; meter_idx < meter_list.size(); meter_idx++) {
+      const std::unique_ptr<Meter> &meter = meter_list[meter_idx];
+
+      meter->assignOutputDir(output_dir);
+      fs::path file_path = meter->getOutputPath();
+      if (!output_files.insert(file_path.string()).second) {
+        LOG(FATAL) << absl::StrFormat(
+            "[MSys] Meter %s at index %d has the "
+            "same output file path as another meter",
+            meter->getName().data(), meter_idx);
+        // cannot reach here
+      }
+    }
+
+    // create the worker info, which will spawn the worker threads
+    worker_info = std::make_unique<WorkerInfo>(this, meter_list.size());
+
+    in_operation = true;
+    return true;
+  }
+  return false;
+}
+
+bool System::stopRecording() noexcept {
+  std::unique_lock lock(operation_status_mutex);
+  if (in_operation) {
+    // halt the system and all worker threads
+    halt();
+
+    in_operation = false;
+    return true;
+  }
+  return false;
+}
+
+bool System::isRecording() {
+  std::unique_lock lock(operation_status_mutex);
+  return in_operation;
+}
+
+const fs::path &System::getOutputDir() const { return output_dir; }
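+/* Illustration (hypothetical values, added for clarity): with a system built
+ * with default_sample_period = 500 ms, adding a meter constructed with a
+ * 250 ms tick period and then calling startRecording() trips the LOG(FATAL)
+ * above; at present every meter must share the system-default period.
+ */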
"In Operation" : "Not In Operation"); + + absl::StrAppendFormat(&report, " System Name: %s\n", getSystemName().data()); + absl::StrAppendFormat(&report, " Output Dir: %s\n", output_dir.string()); + absl::StrAppendFormat(&report, " Has #meter: %zu\n", meter_list.size()); + for (unsigned meter_idx = 0; meter_idx < meter_list.size(); meter_idx++) { + const std::unique_ptr &meter = meter_list[meter_idx]; + + size_t written_times = meter->getWrittenTimes(); + size_t written_size = meter->getWrittenSize(); + size_t cur_msg_wire_size = meter->getCurrentMessageSerializedSize(); + size_t cur_msg_mem_size = meter->getCurrentMessageMemorySize(); + + absl::StrAppendFormat(&report, " Meter #%-4d: %s\n", meter_idx, meter->getName().data()); + absl::StrAppendFormat( + &report, " Tick Period: %d ms\n", meter->getTickPeriod().count()); + absl::StrAppendFormat(&report, " Written times: %d times\n", written_times); + absl::StrAppendFormat( + &report, " Written size: %zu B (%.1f MB)\n", written_size, + (double)written_size / (1024 * 1024)); + absl::StrAppendFormat( + &report, " Msg wire size: %zu B (%.1f MB)\n", cur_msg_wire_size, + (double)cur_msg_wire_size / (1024 * 1024)); + // Show the memory size of the current message + absl::StrAppendFormat( + &report, " Msg mem size: %u B (%.1f MB)\n", cur_msg_mem_size, + (double)cur_msg_mem_size / (1024 * 1024)); + if (detail) { + std::string detail_report = meter->getDetailedReport(); + if (!detail_report.empty()) { + absl::StrAppendFormat( + &report, " Detailed Report:\n%s\n", indent(detail_report, " ").c_str()); + } else { + absl::StrAppendFormat(&report, " No detailed report available\n"); + } + } + } + absl::StrAppendFormat(&report, "# === Report END ==="); + + verbosePrint(verbose, "%s", report.c_str()); +} + +void System::resetAllBuffers() noexcept { + std::unique_lock lock(operation_status_mutex); + if (in_operation) { + LOG(ERROR) << absl::StrFormat( + "[MSys] #%u unexpected buffer reset called while in operation, refuse to take action", + system_id); + return; + } + + resetAllBuffersInternal(); +} + +void System::resetAllBuffersInternal() noexcept { + for (std::unique_ptr &meter : meter_list) { + meter->resetBuffer(); + } +} + +size_t System::getMsgWriteSizeThreshold() const { return msg_write_size_threshold; } + +const cr::milliseconds &System::getDefaultSamplePeriod() const { return default_sample_period; } + +bool System::isValid() const { + for (const std::unique_ptr &meter : meter_list) + if (!meter->isValid()) return false; + return true; +} + +bool System::testRun() { + { + std::unique_lock lock(operation_status_mutex); + if (in_operation) { + LOG(ERROR) << absl::StrFormat( + "[MSys] #%u (%s) cannot perform a test run when the system is already in operation", + system_id, getSystemName().data()); + return false; + } + in_operation = true; + } + + // Call this function before bails out, in_operation flag will not be reset properly otherwise. 
+bool System::testRun() {
+  {
+    std::unique_lock lock(operation_status_mutex);
+    if (in_operation) {
+      LOG(ERROR) << absl::StrFormat(
+          "[MSys] #%u (%s) cannot perform a test run when the system is already in operation",
+          system_id, getSystemName().data());
+      return false;
+    }
+    in_operation = true;
+  }
+
+  // Call this lambda before bailing out; otherwise the in_operation flag will
+  // not be reset properly.
+  auto terminate_test_run = [this]() -> void {
+    std::unique_lock lock(operation_status_mutex);
+    in_operation = false;
+  };
+
+  if (meter_list.empty()) {
+    LOG(ERROR) << absl::StrFormat(
+        "[MSys] #%u (%s) cannot perform a test run with no meters", system_id,
+        getSystemName().data());
+    terminate_test_run();
+    return false;
+  }
+
+  if (!isValid()) {
+    std::vector<std::string> meter_names;
+    constexpr std::string_view idx_header = "Idx";
+    unsigned pad_length =
+        std::max(idx_header.size(), (size_t)std::ceil(std::log10(meter_list.size()) + 1));
+    for (unsigned meter_idx = 0; meter_idx < meter_list.size(); meter_idx++) {
+      const std::unique_ptr<Meter> &m = meter_list[meter_idx];
+      meter_names.push_back(
+          strPad(meter_idx, pad_length) + ": " + std::string(m->getName().data()) +
+          (m->isValid() ? "" : " <= Invalid Meter"));
+    }
+    LOG(ERROR) << absl::StrFormat(
+        "[MSys] #%u (%s) has at least one invalid meter, cannot perform a test run. Detailed "
+        "reports:\n %s: MeterName\n %s",
+        system_id, getSystemName().data(), idx_header.data(),
+        strJoin(meter_names.begin(), meter_names.end(), "\n ").c_str());
+    terminate_test_run();
+    return false;
+  }
+
+  for (unsigned meter_idx = 0; meter_idx < meter_list.size(); meter_idx++) {
+    const std::unique_ptr<Meter> &meter = meter_list[meter_idx];
+    if (!meter->isValid()) {
+      // NOTE: normally unreachable after the isValid() check above; kept as a
+      // per-meter safety net
+      LOG(ERROR) << absl::StrFormat(
+          "[MSys] #%u (%s) has invalid meter %s at index %d, cannot perform a test run",
+          system_id, getSystemName().data(), meter->getName().data(), meter_idx);
+      terminate_test_run();
+      return false;
+    }
+  }
+
+  bool ret = true;
+  LOG(INFO) << absl::StrFormat(
+      "[MSys] #%u (%s) test run started, will update all %zu meters", system_id,
+      getSystemName().data(), meter_list.size());
+  resetAllBuffersInternal();
+
+  size_t msg_write_size_threshold = getMsgWriteSizeThreshold();
+  size_t sample_period_ms = getDefaultSamplePeriod().count();
+  size_t total_wire_size = 0;
+  for (unsigned meter_idx = 0; meter_idx < meter_list.size(); meter_idx++) {
+    std::unique_ptr<Meter> &meter = meter_list[meter_idx];
+
+    LOG(INFO) << absl::StrFormat(
+        "[MSys] System #%u (%s) Meter #%u (%s) test run started", system_id,
+        getSystemName().data(), meter_idx, meter->getName().data());
+    cr::time_point<cr::steady_clock> start = cr::steady_clock::now();
+    bool meter_ret = meter->update(true);
+    cr::time_point<cr::steady_clock> end = cr::steady_clock::now();
+    // calculate the duration of the update
+    cr::microseconds duration = cr::duration_cast<cr::microseconds>(end - start);
+
+    // get a rough idea of the meter write interval
+    size_t current_wire_size = meter->getCurrentMessageSerializedSize();
+    size_t nwrites =
+        current_wire_size == 0
+            ?
0 + : (msg_write_size_threshold + current_wire_size - 1) / current_wire_size; + double avg_write_interval_ms = + (double)sample_period_ms * msg_write_size_threshold / current_wire_size; + total_wire_size += current_wire_size; + + if (current_wire_size == 0) { + LOG(ERROR) << absl::StrFormat( + "[MSys] System #%u (%s) Meter #%u (%s) message wire size 0", system_id, + getSystemName().data(), meter_idx, meter->getName().data()); + meter_ret = false; + } + + if (meter_ret) { + LOG(INFO) << absl::StrFormat( + "[MSys] System #%u (%s) Meter #%u (%s) test run succeeded.\n" + " - Write threshold: %zu B (%.2f MB), Single write size: %zu B (%.2f " + "kB)\n" + " Avg write interval: %.2f ms (%.2f s, %.2f h), %zu writes " + "expected\n" + " - Update period: %ld ms, Actual update duration: %.3f ms (%.2f%%)", + system_id, getSystemName().data(), meter_idx, meter->getName().data(), + msg_write_size_threshold, (double)msg_write_size_threshold / (1024 * 1024), + current_wire_size, (double)current_wire_size / 1024, avg_write_interval_ms, + avg_write_interval_ms / 1000.0, avg_write_interval_ms / (1000 * 3600.0), nwrites, + sample_period_ms, duration.count() / 1000.0, + duration.count() / 1000.0 / sample_period_ms * 100.0); + } else { + LOG(ERROR) << absl::StrFormat( + "[MSys] System #%u (%s) Meter #%u (%s) test run FAILED", system_id, + getSystemName().data(), meter_idx, meter->getName().data()); + } + ret &= meter_ret; + } + double write_size_per_sec = (double)total_wire_size / sample_period_ms * 1000.0; + LOG(INFO) << absl::StrFormat( + "[MSys] System #%u (%s) test run finished, total wire size: %zu B (%.2f " + "MB), " + "write size per second: %.2f B/s (%.2f MB/s %.2f MB/h)", + system_id, getSystemName().data(), total_wire_size, (double)total_wire_size / (1024 * 1024), + write_size_per_sec, write_size_per_sec / (1024 * 1024), + write_size_per_sec / (1024 * 1024) * 3600); + + resetAllBuffersInternal(); + for (std::unique_ptr &meter : meter_list) { + if (meter->getCurrentMessageSerializedSize() > 0) { + LOG(FATAL) << absl::StrFormat( + "[MSys] Meter %s has non-empty message after test run", meter->getName().data()); + ret = false; + } + } + + terminate_test_run(); + return ret; +} + +bool System::update() noexcept { + bool ret = true; + for (std::unique_ptr &meter : meter_list) { + bool r = meter->update(); + if (!r) { + fprintf(stderr, "[MSys] Meter %s update FAILED\n", meter->getName().data()); + ret = false; + } + // ret &= meter->update(); + } + return ret; +} + +void System::halt() noexcept { + // Instantly wake up all the worker threads spawned and tell them to exit + // gracefully by calling the destructor of WorkerInfo + worker_info.reset(); + + // Persist all remaining data that registered in memory + for (const std::unique_ptr &meter : meter_list) { + // make the async function wait for completion before return + meter->writeDataToFile(true); + meter->fsyncDataToFile(); + } + + if (in_operation) + LOG(INFO) << absl::StrFormat("[MSys] #%u halted", system_id); + else + LOG(INFO) << absl::StrFormat("[MSys] #%u not in operation", system_id); +} + +WorkerInfo::WorkerInfo(System *system, unsigned nmeters) + : system(system), + worker_sync_point(nmeters + 1), // +1 for the coordinator thread + worker_stop(false), + meter_update_durations(nmeters, std::deque()), + meter_thread_finish_times(nmeters, 0), + system_creation_time(cr::steady_clock::now()), + coordinator_thread(&WorkerInfo::coordinator_thread_func, this) { + // create the worker threads, each thread will handle a subset of meters + for (unsigned 
meter_idx = 0; meter_idx < nmeters; meter_idx++) { + worker_threads.emplace_back(&WorkerInfo::worker_thread_func, this, meter_idx); + } + LOG(INFO) << absl::StrFormat( + "[MSys WorkerPool] Worker pool for MSys #%u constructed with %zu meters", + system->getSystemID(), nmeters); +} + +WorkerInfo::~WorkerInfo() { + // send a signal to attempt to stop all worker threads + worker_stop.store(true); + + LOG(INFO) << absl::StrFormat( + "[MSys WorkerPool] Stopping spawned threads for " + "MSys #%u, waiting for threads to join...", + system->system_id); + // wait for all worker threads to finish + coordinator_thread.join(); + for (std::thread &t : worker_threads) { + t.join(); + } + worker_threads.clear(); + + LOG(INFO) << absl::StrFormat( + "[MSys WorkerPool] Worker pool for MSys #%u destructed", system->getSystemID()); +} + +void WorkerInfo::coordinator_thread_func() { + unsigned long msg_write_size_threshold = system->getMsgWriteSizeThreshold(); + cr::time_point next_round_time = + system_creation_time + system->default_sample_period; + while (true) { + // wait for the signal to stop + std::this_thread::sleep_until(next_round_time); + + // exit the thread if stop signal is received + if (worker_stop.load()) { + // FIXME: discard a function labeled with [[nodiscard]] + (void)worker_sync_point.arrive_and_drop(); + break; + } + + // notify all worker threads to start a new round of profiling + worker_sync_point.arrive_and_wait(); + + // wait for all worker threads to finish their work in this round + worker_sync_point.arrive_and_wait(); + + for (const std::unique_ptr &meter : system->meter_list) { + if (meter->getCurrentMessageSerializedSize() >= msg_write_size_threshold) { + meter->writeDataToFile(); + } + } + + cr::time_point round_finish_time = cr::steady_clock::now(); + next_round_time += system->default_sample_period; + + cr::milliseconds time_remaining = + cr::duration_cast(next_round_time - round_finish_time); + + const double warning_frac = 0.1; // 10% of the sample period + int time_remaining_ms = cr::duration_cast(time_remaining).count(); + unsigned default_sample_period_ms = + cr::duration_cast(system->default_sample_period).count(); + + if (time_remaining_ms < warning_frac * default_sample_period_ms) { + LOG(WARNING) << absl::StrFormat( + "[MSys WorkerPool] Coordinator thread for MSys #%u (%s): " + "Next round time %ld ms is too close to the current round finish time %ld ms. " + "Only %ld ms remaining, less than %.2f%% of the sample period (%ld ms). 
" + "Consider increasing the sample period.", + system->getSystemID(), system->getSystemName().data(), + cr::duration_cast(next_round_time.time_since_epoch()).count(), + cr::duration_cast(round_finish_time.time_since_epoch()).count(), + time_remaining_ms, (double)time_remaining_ms / default_sample_period_ms * 100.0, + default_sample_period_ms); + } + } +} + +void WorkerInfo::worker_thread_func(const unsigned meter_idx) { + const std::unique_ptr &meter = system->meter_list[meter_idx]; + + while (true) { + // wait for coordination signal + worker_sync_point.arrive_and_wait(); + + // exit the thread if stop signal is received + if (worker_stop.load()) { + worker_sync_point.arrive_and_drop(); + break; + } + + cr::time_point start = cr::high_resolution_clock::now(); + meter->update(); + cr::time_point end = cr::high_resolution_clock::now(); + + meter_update_durations[meter_idx].push_back( + std::chrono::duration_cast(end - start).count()); + + meter_thread_finish_times[meter_idx] = + cr::high_resolution_clock::now().time_since_epoch().count(); + + // notify the coordinator thread that this worker thread has finished + worker_sync_point.arrive_and_wait(); + } +} + +bool msysInitialize(const std::string &log_dir) { + Detail::systemOnceInitialized = true; + return loggerInitialize(log_dir); +} + +// FIXME: currently it seems with templated function called from interface an +// undefined symbol error will be raised when importing the library. Hide this +// templated implementation now and use a explicit one. +// template +// SystemID constructNewSystem(Args &&...args) { +// using Detail::existing_systems, Detail::system_index_generator; + +// SystemID idx; +// do { +// idx = system_index_generator(); +// } while (existing_systems.find(idx) != existing_systems.end()); +// existing_systems.emplace( +// idx, std::make_shared(std::forward(args)...) 
+// ); +// return idx; +// } + +/** + * Construct a monitoring system and return its index as an identifier to access + * that instance + * + * @note this method is NOT thread-safe + * @return an ID associated with the system + */ +SystemID constructNewSystem( + const std::string &output_dir, unsigned default_sample_period_ms, + const std::string &system_name, const size_t msg_write_size_threshold) { + using Detail::existing_systems; + + SystemID id = Detail::getNewSystemID(); + + fs::path output_dir_path = validateDir(output_dir); + if (output_dir_path.empty()) return invalidSystemID; + + existing_systems.emplace( + id, std::make_shared( + id, system_name, output_dir_path, cr::milliseconds{default_sample_period_ms}, + msg_write_size_threshold)); + return id; +} + +std::shared_ptr retrieveSystemUsingIndex(SystemID id) { + using Detail::existing_systems; + + auto result = existing_systems.find(id); + if (result == existing_systems.end()) { + return nullptr; + } + return result->second; +} + +bool msysTestRun(SystemID id) { + std::shared_ptr system = retrieveSystemUsingIndex(id); + if (!system) return false; + + bool ret = system->update(); + system->resetAllBuffers(); + return ret; +} + +} // namespace MSys diff --git a/monitoring_sys/src/proc_meter.cc b/monitoring_sys/src/proc_meter.cc new file mode 100644 index 0000000..7dd1427 --- /dev/null +++ b/monitoring_sys/src/proc_meter.cc @@ -0,0 +1,276 @@ +#include +#include +#include + +#include "include/proc_meter.hh" + +namespace MSys { + +enum class Probes { + STAT, // /proc//stat + STATM, // /proc//statm + IO, // /proc//io +}; + +namespace Detail { + +/** + * Open a file for reading with the given filename format and process ID. + * + * @note snprintf insufficient buf length is translated to failed open + * @tparam Buflen length of the buffer to hold the file path + * @param filename_format format string for the filename, should be + * @param pid process ID to substitute into the filename format + * @return FILE pointer to the opened file, can be nullptr if open failed + */ +template +FILE *openFileForRead(const char *filename_format, int pid) { + char path[Buflen]; + snprintf(path, sizeof(path), filename_format, pid); + FILE *fp = fopen(path, "r"); + return fp; +} + +/** + * Open a file in /proc/ for reading. 
+ * + * @param file file name to open, should contain the leading slash + * @param pid process ID to substitute into the file path + */ +#define OPEN_PROC_PID_FILE_FOR_READ(file, pid) \ + openFileForRead( \ + PROCDIR "/%d" file, pid) + +// Refer to https://man7.org/linux/man-pages/man5/proc_pid_stat.5.html +static const char *proc_pid_stat_format = + "%*d " // (1) [NT] pid %d + "(%*[^)]) " // (2) [NT] comm %s + "%c " // (3) [1] state %c + "%*d " // (4) [NT] ppid %d + "%*d " // (5) [NT] pgrp %d + "%*d " // (6) [NT] session %d + "%*d " // (7) [NT] tty_nr %d + "%*d " // (8) [NT] tpgid %d + "%*u " // (9) [NT] flags %u + "%lu " // (10) [2] minflt %lu + "%lu " // (11) [3] cminflt %lu + "%lu " // (12) [4] majflt %lu + "%lu " // (13) [5] cmajflt %lu + "%lu " // (14) [6] utime %lu + "%lu " // (15) [7] stime %lu + "%ld " // (16) [8] cutime %ld + "%ld " // (17) [9] cstime %ld + "%ld " // (18) [10] priority %ld + "%ld " // (19) [11] nice %ld + "%ld " // (20) [12] num_threads %ld + "%*ld " // (21) [NT] itrealvalue %ld + "%*llu " // (22) [NT] starttime %llu + "%lu " // (23) [13] vsize %lu + ; /** fields after vsize are NT because they are not relevant for + process resource monitoring */ + +static inline bool parseProcPIDStat(int pid, google::protobuf::Message *const pid_stat_msg) { + ProcPIDStatMetrics *const pid_stat_metrics = dynamic_cast(pid_stat_msg); + if (unlikely(!pid_stat_metrics)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Invalid ProcPIDStatMetrics pointer for pid %d", pid); + return false; + } + + FILE *fp = OPEN_PROC_PID_FILE_FOR_READ(STATFILE, pid); + if (unlikely(!fp)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Failed to open %s for pid %d: %s", STATFILE, pid, strerror(errno)); + return false; + } + + char state; + unsigned long minflt, cminflt, majflt, cmajflt, utime, stime; + long cutime, cstime, priority, nice, num_threads; + unsigned long vsize; + + int ret = fscanf( + fp, proc_pid_stat_format, &state, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime, + &cutime, &cstime, &priority, &nice, &num_threads, &vsize); + if (unlikely(ret < 12)) { + LOG(WARNING) << absl::StrFormat( + "[ProcMeter] Failed to parse %s for pid %d: expected 12 fields, got %d", STATFILE, pid, + ret); + } + + pid_stat_metrics->set_state(state); + pid_stat_metrics->set_minflt(minflt); + pid_stat_metrics->set_cminflt(cminflt); + pid_stat_metrics->set_majflt(majflt); + pid_stat_metrics->set_cmajflt(cmajflt); + pid_stat_metrics->set_utime(utime); + pid_stat_metrics->set_stime(stime); + pid_stat_metrics->set_cutime(cutime); + pid_stat_metrics->set_cstime(cstime); + pid_stat_metrics->set_priority(priority); + pid_stat_metrics->set_nice(nice); + pid_stat_metrics->set_num_threads(num_threads); + pid_stat_metrics->set_vsize(vsize); + + fclose(fp); + return true; +} + +static const char *proc_pid_statm_format = + "%lu " // (1) [1] size %lu + "%lu " // (2) [2] resident %lu + "%lu " // (3) [3] share %lu + "%lu " // (4) [4] text %lu + "%lu " // (5) [5] lib %lu + "%lu " // (6) [6] data %lu + "%lu " // (7) [7] dt %lu + ; /** fields after dt are NT because they are not relevant for + process resource monitoring */ + +static inline bool parseProcPIDStatm(int pid, ProcPIDStatmMetrics *const pid_statm_msg) { + ProcPIDStatmMetrics *const pid_statm_metrics = + dynamic_cast(pid_statm_msg); + if (unlikely(!pid_statm_metrics)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Invalid ProcPIDStatmMetrics pointer for pid %d", pid); + return false; + } + + FILE *fp = OPEN_PROC_PID_FILE_FOR_READ(STATMFILE, pid); + if 
(unlikely(!fp)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Failed to open %s for pid %d: %s", STATMFILE, pid, strerror(errno)); + return false; + } + + unsigned long size, resident, shared, text, lib, data, dt; + int nfields = + fscanf(fp, proc_pid_statm_format, &size, &resident, &shared, &text, &lib, &data, &dt); + if (unlikely(nfields < 7)) { + LOG(WARNING) << absl::StrFormat( + "[ProcMeter] Failed to parse %s for pid %d: expected 7 fields, got %d", STATMFILE, pid, + nfields); + } + + pid_statm_metrics->set_size(size); + pid_statm_metrics->set_resident(resident); + pid_statm_metrics->set_share(shared); + pid_statm_metrics->set_text(text); + pid_statm_metrics->set_lib(lib); + pid_statm_metrics->set_data(data); + pid_statm_metrics->set_dt(dt); + + fclose(fp); + return true; +} + +static const char *proc_pid_io_format = + "rchar: %lu " // (1) [1] read chars %lu + "wchar: %lu " // (2) [2] written chars %lu + "syscr: %lu " // (3) [3] read syscalls %lu + "syscw: %lu " // (4) [4] write syscalls %lu + "read_bytes: %lu " // (5) [5] read bytes %lu + "write_bytes: %lu " // (6) [6] written bytes %lu + "cancelled_write_bytes: %lu " // (7) [7] cancelled write bytes %lu + ; + +static inline bool parseProcPIDIO(int pid, ProcPIDIOMetrics *const pid_io_msg) { + ProcPIDIOMetrics *const pid_io_metrics = dynamic_cast(pid_io_msg); + if (unlikely(!pid_io_metrics)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Invalid ProcPIDIOMetrics pointer for pid %d", pid); + return false; + } + + FILE *fp = OPEN_PROC_PID_FILE_FOR_READ(IOFILE, pid); + if (unlikely(!fp)) { + LOG(ERROR) << absl::StrFormat( + "[ProcMeter] Failed to open %s for pid %d: %s", IOFILE, pid, strerror(errno)); + return false; + } + + unsigned long rchar, wchar, syscr, syscw, read_bytes, write_bytes, cancelled_write_bytes; + int nfields = fscanf( + fp, proc_pid_io_format, &rchar, &wchar, &syscr, &syscw, &read_bytes, &write_bytes, + &cancelled_write_bytes); + if (unlikely(nfields < 7)) { + LOG(WARNING) << absl::StrFormat( + "[ProcMeter] Failed to parse %s for pid %d: expected 7 fields, got %d", IOFILE, pid, + nfields); + } + + pid_io_metrics->set_rchar(rchar); + pid_io_metrics->set_wchar(wchar); + pid_io_metrics->set_syscr(syscr); + pid_io_metrics->set_syscw(syscw); + pid_io_metrics->set_read_bytes(read_bytes); + pid_io_metrics->set_write_bytes(write_bytes); + pid_io_metrics->set_cancelled_write_bytes(cancelled_write_bytes); + + fclose(fp); + return true; +} + +} // namespace Detail + +ProcMeter::ProcMeter( + cr::milliseconds tick_period, const std::vector &pids, + const std::vector &probes) + : Meter("ProcMeter", tick_period, [] { return new ProcMetricsTimeSeries(); }), + pids(pids), + probes(probes.begin(), probes.end()) { + if (pids.empty()) { + LOG(ERROR) << "[ProcMeter] No PIDs provided for ProcMeter"; + return; + } + + if (probes.empty()) { + LOG(ERROR) << "[ProcMeter] No probes provided for ProcMeter"; + return; + } + + markValid(); +} + +bool ProcMeter::update(bool testrun) { + UNUSED(testrun); + + ProcMetrics *proc_metrics = getCurrentBuffer()->add_metrics(); + bool ret = true; + for (const pid_t pid : pids) { + PerProcMetrics *per_proc_metrics = proc_metrics->add_per_proc_metrics(); + // FIXME: current way of iterating is not efficient enough + // STAT + if (probes.find(ProcMetadata::STAT) != probes.end()) + ret &= Detail::parseProcPIDStat(pid, per_proc_metrics->mutable_pid_stat_metrics()); + // STATM + if (probes.find(ProcMetadata::STATM) != probes.end()) + ret &= Detail::parseProcPIDStatm(pid, 
per_proc_metrics->mutable_pid_statm_metrics()); + // IO + if (probes.find(ProcMetadata::IO) != probes.end()) + ret &= Detail::parseProcPIDIO(pid, per_proc_metrics->mutable_pid_io_metrics()); + } + return ret; +} + +std::string ProcMeter::getDetailedReport() const { + std::string report; + report += absl::StrFormat("Monitored PIDs:"); + for (const auto &pid : pids) { + report += absl::StrFormat("\n - %d", pid); + } + + report += "\nEnabled probe(s):"; + const proto::EnumDescriptor *probe_enum_desc = proto::GetEnumDescriptor(); + for (const auto &probe : probes) { + unsigned probe_value = static_cast(probe); + const proto::EnumValueDescriptor *value_desc = + probe_enum_desc->FindValueByNumber(probe_value); + report += absl::StrFormat( + "\n - %s.%s (%d)", probe_enum_desc->full_name().data(), value_desc->name().data(), + probe_value); + } + return report; +} + +} // namespace MSys diff --git a/monitoring_sys/src/utils.cc b/monitoring_sys/src/utils.cc new file mode 100644 index 0000000..4104677 --- /dev/null +++ b/monitoring_sys/src/utils.cc @@ -0,0 +1,422 @@ +#include +#include +#include +#include +#include +#include + +#include "include/utils.hh" + +static const date::time_zone *current_tz = date::current_zone(); + +unsigned getSystemNProc() { return sysconf(_SC_NPROCESSORS_ONLN); } + +unsigned getSystemPageSize() { return sysconf(_SC_PAGESIZE); } + +// Jiffies warp around in 2^32 / HZ / 86400 = 497 days with HZ = 100 (typical) + +unsigned getSystemHz() { return sysconf(_SC_CLK_TCK); } + +cr::nanoseconds nsSinceEpoch() { + return cr::duration_cast(cr::steady_clock::now().time_since_epoch()); +} + +fs::path validateDir(const std::string &dir) { + std::error_code ec; + fs::path p = fs::weakly_canonical(dir, ec); + if (ec.value() == 0) return p; + return fs::path(); +} + +std::string getCurrentTime(const cr::system_clock::time_point &p, const std::string &time_format) { + date::zoned_time zoned_time = date::zoned_time(current_tz, p); + return date::format(time_format, zoned_time); +} +std::string indent(const std::string &input, const std::string &prefix) { + std::istringstream iss(input); + std::ostringstream oss; + std::string line; + + bool first = true; + while (std::getline(iss, line)) { + if (!first) oss << '\n'; + oss << prefix << line; + first = false; + } + + return oss.str(); +} + +const std::regex scanf_field_format( + R"(%[0 #+-]?\d*\.?\d*([hl]{0,2}|[jztL])?([diuoxXeEfgGaAcpsSn%]|\[[^\[\]]+\]))"); +const std::regex scanf_string_field_format(R"(%\d*(s|\[[^\[\]]+\]))"); + +/** + * @brief Get the number of scanf string fields in a format string. + * @see https://stackoverflow.com/questions/45215648/regex-capture-type-specifiers-in-format-string + * @note The regex is adapted to match for scanf formats. The major change includes + * 1) Ignoring the asterisk (*) in the format string as it means skipping the field in scanf. + * 2) Add matching for scanset and negated scanset (e.g., %[abc] and %[^abc]) for strings. + * @param format The format string to analyze. + * @return The number of scanf string fields found in the string, excluding ignored fields. + */ +static unsigned getNFormatFields(const std::string &format) { + std::regex re(R"(%[0 #+-]?\d*\.?\d*([hl]{0,2}|[jztL])?([diuoxXeEfgGaAcpsSn%]|\[[^\[\]]+\]))"); + auto begin = std::sregex_iterator(format.begin(), format.end(), re); + auto end = std::sregex_iterator(); + return std::distance(begin, end); +} + +/** + * @brief Get the number of scanf string fields in a format string. 
+ * @see https://stackoverflow.com/questions/45215648/regex-capture-type-specifiers-in-format-string + * @note The regex is adapted to match for scanf formats. The major change includes + * 1) Only match format options for strings (i.e., %\d*s, scanset, and negated scanset). + * @param format The format string to analyze. + * @return The number of scanf string fields found in the string, excluding ignored fields. + */ +static unsigned getNStringFormatFields(const std::string &format) { + std::regex re(R"(%\d*(s|\[[^\[\]]+\]))"); + auto begin = std::sregex_iterator(format.begin(), format.end(), re); + auto end = std::sregex_iterator(); + return std::distance(begin, end); +} + +/** + * @brief Generate a fast scanf format string from a given scanf format by ignoring the key field. + * @param field_scanf_format The original scanf format string. + * @return A modified scanf format string that only matches the value. + */ +static std::string generateFastScanfFormat(const std::string &field_scanf_format) { + std::string field_fast_scanf_format = field_scanf_format; + const std::regex pattern(R"(%\d*(s|\[[^\[\]]+\]))"); + std::smatch match; + + if (!std::regex_search(field_fast_scanf_format, match, pattern) || match.size() < 1) { + field_fast_scanf_format.clear(); + } else { + std::string replacement = "%*" + std::string(match[1]); + field_fast_scanf_format.replace(match.position(0), match.length(0), replacement); + } + + return field_fast_scanf_format; +} + +/** + * @brief Get hint information for a message and its key list. + * @note This function is expensive because it format strings in a pretty way. + * @param msg The protobuf message. + * @param key_list The list of keys associated with the message. + * @return {full name of the message, a string representation of the key list}. + */ +static std::pair getHintInfo( + const proto::Descriptor *msg_desc, const std::vector &key_list) { + std::vector field_names; + for (int i = 0; i < msg_desc->field_count(); ++i) { + const proto::FieldDescriptor *field_desc = msg_desc->field(i); + field_names.push_back(field_desc->name().data()); + } + const std::string message_hint = + "(" + std::string(msg_desc->full_name()) + "): " + + (field_names.size() ? strJoin(field_names.begin(), field_names.end(), ", ") : ""); + const std::string key_hint = + key_list.size() ? 
strJoin(key_list.begin(), key_list.end(), ", ") : ""; + return {std::string(message_hint), key_hint}; +} + +KVRepr::KVRepr( + const fs::path &stat_file_path, const std::vector &message_descs, + const std::vector> &key_lists, const std::string &field_scanf_format, + const unsigned key_field_max_length, const unsigned val_field_max_length) + : stat_file_path(stat_file_path), + message_descs(message_descs), + key_lists(key_lists), + field_scanf_format(field_scanf_format), + key_field_max_length(key_field_max_length), + val_field_max_length(val_field_max_length) { + // exactly two string fields and no other type of fields are expected in the scanf format + unsigned scanf_nfields = getNFormatFields(field_scanf_format); + unsigned scanf_n_string_fields = getNStringFormatFields(field_scanf_format); + if (scanf_nfields != scanf_n_string_fields || scanf_nfields != 2) { + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Expect exactly two string fields in scanf format, get \"%s\" (%u fields, %u " + "string fields)", + field_scanf_format, scanf_nfields, scanf_n_string_fields); + return; + } + + // newline characters are not allowed in the scanf format + if (field_scanf_format.find('\n') != std::string::npos) { + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Newline characters are not allowed in scanf format \"%s\"", + field_scanf_format); + return; + } + + if (message_descs.size() != key_lists.size()) { + const auto [message_hint, key_hint] = getHintInfo(message_descs[0], key_lists[0]); + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Number of messages (%zu) and key_lists (%zu) do not match. Initialized with\n" + " messages[0]: %s\n" + " key_lists[0]: %s", + message_descs.size(), key_lists.size(), message_hint, key_hint); + return; + } + for (unsigned msg_idx = 0; msg_idx < message_descs.size(); ++msg_idx) { + const proto::Descriptor *msg_desc = message_descs[msg_idx]; + const std::vector &key_list = key_lists[msg_idx]; + + size_t msg_nfields = msg_desc->field_count(); + size_t key_list_nfields = key_list.size(); + if (msg_nfields != key_list_nfields) { + const auto [message_hint, key_hint] = getHintInfo(msg_desc, key_list); + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Length of message (%zu) and key_list (%zu) do not match at message index " + "%u. Initialized with\n" + " messages[%u]: %s\n" + " key_lists[%u]: %s", + msg_nfields, key_list_nfields, msg_idx, msg_idx, message_hint, msg_idx, key_hint); + return; + } + } + + FILE *const fp = fopen(stat_file_path.c_str(), "r"); + if (!fp) { + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Failed to open file %s: %s", stat_file_path.string(), strerror(errno)); + return; + } + + std::unordered_map key_to_line_idx; + int line_idx = 0; + do { + char key_buffer[key_field_max_length + 1]; + char val_buffer[val_field_max_length + 1]; + int nfields = fscanf(fp, field_scanf_format.c_str(), key_buffer, val_buffer); + UNUSED(val_buffer); + + std::string key_string(key_buffer, strnlen(key_buffer, key_field_max_length)); + if (unlikely(nfields != 2)) { + LOG(ERROR) << absl::StrFormat( + "[KVRepr] Failed to parse line in file %s with format \"%s\". " + "Expected 2 fields, got %d. 
+
+static bool setProtoFieldFromString(
+    const char *value_str, proto::Message *const message, unsigned field_idx) {
+    const proto::FieldDescriptor *field_desc = message->GetDescriptor()->field(field_idx);
+    const proto::Reflection *reflection = message->GetReflection();
+
+    std::from_chars_result result;
+    const char *start = value_str;
+    const char *end = value_str + strlen(value_str);
+    switch (field_desc->cpp_type()) {
+        case proto::FieldDescriptor::CPPTYPE_INT64: {
+            // value-initialize so a failed from_chars cannot feed garbage to the setter
+            int64_t value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetInt64(message, field_desc, value);
+            break;
+        }
+        case proto::FieldDescriptor::CPPTYPE_INT32: {
+            int32_t value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetInt32(message, field_desc, value);
+            break;
+        }
+        case proto::FieldDescriptor::CPPTYPE_UINT64: {
+            uint64_t value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetUInt64(message, field_desc, value);
+            break;
+        }
+        case proto::FieldDescriptor::CPPTYPE_UINT32: {
+            uint32_t value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetUInt32(message, field_desc, value);
+            break;
+        }
+        case proto::FieldDescriptor::CPPTYPE_DOUBLE: {
+            double value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetDouble(message, field_desc, value);
+            break;
+        }
+        case proto::FieldDescriptor::CPPTYPE_FLOAT: {
+            float value = 0;
+            result = std::from_chars(start, end, value);
+            reflection->SetFloat(message, field_desc, value);
+            break;
+        }
+        default: {
+            LOG(ERROR) << absl::StrFormat(
+                "Unsupported field type %s for message \"%s\" field #%u \"%s\". "
+                "Only numeric fields are supported.",
+                proto::FieldDescriptor::CppTypeName(field_desc->cpp_type()),
+                message->GetDescriptor()->full_name(), field_idx, field_desc->name());
+            return false;
+        }
+    }
+
+    if (unlikely(result.ec != std::errc())) {
+        LOG(ERROR) << absl::StrFormat(
+            "Failed to parse value \"%s\" for message \"%s\" field #%u \"%s\". "
+            "Error: %s",
+            value_str, message->GetDescriptor()->full_name(), field_idx, field_desc->name(),
+            std::make_error_code(result.ec).message());
+        return false;
+    }
+    return true;
+}
+
+bool KVRepr::parseOnce(std::vector<proto::Message *> &parsed_messages) const {
+    if (unlikely(!valid)) {
+        LOG(ERROR) << "KVRepr is not valid. Cannot parse messages.";
+        return false;
+    }
+
+    if (unlikely(parsed_messages.size() != message_descs.size())) {
+        LOG(ERROR) << absl::StrFormat(
+            "Number of parsed messages (%zu) does not match number of message descriptors (%zu). "
+            "Cannot parse messages.",
+            parsed_messages.size(), message_descs.size());
+        return false;
+    }
+
+    FILE *const fp = fopen(stat_file_path.c_str(), "r");
+    if (!fp) {
+        LOG(ERROR) << absl::StrFormat(
+            "Failed to open file %s: %s", stat_file_path.string(), strerror(errno));
+        return false;
+    }
+
+    unsigned current_line = 0;
+    for (auto &next_field : kv_map) {
+        unsigned line_idx = next_field.first;
+        unsigned msg_idx = next_field.second.first;
+        unsigned field_idx = next_field.second.second;
+
+        // skip lines until we reach the desired line index
+        while (current_line < line_idx) {
+            // consume the remainder of the current line (and trailing whitespace) without storing it
+            int result = fscanf(fp, "%*[^\n] ");
+            if (unlikely(result == EOF)) {
+                LOG(ERROR) << absl::StrFormat(
+                    "Unexpected end of file while reading line %u for message \"%s\" field #%u",
+                    current_line, parsed_messages[msg_idx]->GetDescriptor()->full_name(),
+                    field_idx);
+                fclose(fp);
+                return false;
+            }
+            current_line++;
+        }
+
+        char val_buffer[val_field_max_length + 1];
+        int nfields = fscanf(fp, field_fast_scanf_format.c_str(), val_buffer);
+        // exactly one value field is expected; EOF also fails this check
+        if (unlikely(nfields != 1)) {
+            LOG(ERROR) << absl::StrFormat(
+                "Failed to read line %u for message \"%s\" field #%u", current_line,
+                parsed_messages[msg_idx]->GetDescriptor()->full_name(), field_idx);
+            fclose(fp);
+            return false;
+        }
+
+        bool ret = setProtoFieldFromString(val_buffer, parsed_messages[msg_idx], field_idx);
+        if (unlikely(!ret)) {
+            LOG(ERROR) << absl::StrFormat(
+                "Failed to parse line %u for message \"%s\" field #%u", current_line,
+                parsed_messages[msg_idx]->GetDescriptor()->full_name(), field_idx);
+            fclose(fp);
+            return false;
+        }
+        current_line++;
+    }
+
+    fclose(fp);
+    return true;
+}
+
+bool KVRepr::isValid() const { return valid; }
+
+const fs::path &KVRepr::getStatFilePath() const { return stat_file_path; }
+
+std::string KVRepr::generateStatusReport() const {
+    std::string ret;
+    if (!isValid()) {
+        absl::StrAppendFormat(&ret, "Invalid KVRepr instance.");
+    } else {
+        absl::StrAppendFormat(
+            &ret,
+            "KVRepr on input file %s"
+            "\n  Generic:"
+            "\n  - Generated fast scanf format: \"%s\" (adapted from original format \"%s\")"
+            "\n  - Number of messages: %zu"
+            "\n  Fields (%zu found, %zu missing):",
+            stat_file_path.string(), field_fast_scanf_format.c_str(), field_scanf_format.c_str(),
+            message_descs.size(), kv_map.size(), missing_fields.size());
+        for (const auto &kv : kv_map) {
+            unsigned line_idx = kv.first;
+            unsigned msg_idx = kv.second.first;
+            unsigned field_idx = kv.second.second;
+            const std::string_view message_name = message_descs[msg_idx]->full_name();
+            const std::string_view proto_field_name =
+                message_descs[msg_idx]->field(field_idx)->name();
+            const std::string &key = key_lists[msg_idx][field_idx];
+            absl::StrAppendFormat(
+                &ret, "\n  - Message <%s:%d> Field \"%s\" (Key \"%s\") found at line %u",
+                message_name.data(), field_idx, proto_field_name.data(), key.c_str(), line_idx);
+        }
+        for (const auto &missing_field : missing_fields) {
+            unsigned msg_idx = missing_field.first;
+            unsigned field_idx = missing_field.second;
+            const std::string_view message_name = message_descs[msg_idx]->full_name();
+            const std::string_view proto_field_name =
+                message_descs[msg_idx]->field(field_idx)->name();
+            const std::string &key = key_lists[msg_idx][field_idx];
+            absl::StrAppendFormat(
+                &ret, "\n  - Message <%s:%d> Field \"%s\" (Key \"%s\") is missing",
+                message_name.data(), field_idx, proto_field_name.data(), key.c_str());
+        }
+    }
+    return ret;
+}
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..1f80f1a
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,67 @@
+import os
+import re
+from collections import defaultdict
+
+
+def parse_token_distribution(log_dir):
+    """
+    Parses wiki_batch_*.log files in the specified directory to calculate the
+    distribution of max token sizes.
+
+    Writes one out_<batch>.txt file per log file with that batch's
+    distribution, and returns the aggregate distribution over all batches.
+
+    Args:
+        log_dir (str): Path to the directory containing log files.
+
+    Returns:
+        dict: A dictionary where keys are token sizes and values are their counts.
+    """
+    token_distribution = defaultdict(int)
+    log_pattern = re.compile(r"wiki_batch_(\d+)\.log")
+    token_size_pattern = re.compile(r"Max Token Size in Batch: (\d+)")
+
+    # Iterate through all files in the directory
+    for file_name in os.listdir(log_dir):
+        name_match = log_pattern.match(file_name)  # Match log file pattern
+        if name_match:
+            file_path = os.path.join(log_dir, file_name)
+            print(f"Processing file: {file_path}")
+            batch_num = int(name_match.group(1))
+
+            batch_distribution = defaultdict(int)
+            with open(file_path, "r") as log_file:
+                for line in log_file:
+                    match = token_size_pattern.search(line)
+                    if match:
+                        token_size = int(match.group(1))
+                        batch_distribution[token_size] += 1
+                        token_distribution[token_size] += 1
+
+            # Write this batch's distribution next to the log file
+            output_file = os.path.join(log_dir, f"out_{batch_num}.txt")
+            with open(output_file, "w") as ofile:
+                for token_size, count in sorted(batch_distribution.items()):
+                    print(f"Token Size: {token_size}, Count: {count}")
+                    ofile.write(f"{token_size}, {count}\n")
+
+    return token_distribution
+
+
+def main():
+    log_directory = (
+        "/mnt/nvme1n1/shaobol2/results"  # Replace with the actual path to your log files
+    )
+    distribution = parse_token_distribution(log_directory)
+
+    # Print the aggregate distribution
+    print("Token Size Distribution:")
+    for token_size, count in sorted(distribution.items()):
+        print(f"Token Size: {token_size}, Count: {count}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/req.txt b/req.txt
new file mode 100644
index 0000000..617849a
--- /dev/null
+++ b/req.txt
@@ -0,0 +1,327 @@
+absl-py==2.3.1
+accelerate==1.10.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+airportsdata==20250811
+annotated-types==0.7.0
+anyio==4.10.0
+appdirs==1.4.4
+argon2-cffi==25.1.0
+argon2-cffi-bindings==25.1.0
+astor==0.8.1
+async-timeout==4.0.3
+attrs==25.3.0
+autopep8==2.3.2
+av==15.1.0
+azure-core==1.35.0
+azure-storage-blob==12.26.0
+backoff==2.2.1
+bcrypt==5.0.0
+beautifulsoup4==4.13.5
+black==25.1.0
+blake3==1.0.5
+build==1.3.0
+cachetools==6.2.0
+cbor==1.0.0
+cbor2==5.7.0
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.4.3
+chromadb==1.2.1 +clang==14.0 +click==8.2.1 +cloudpickle==3.1.1 +coloredlogs==15.0.1 +colpali_engine==0.3.12 +compressed-tensors==0.9.2 +contourpy==1.3.2 +cryptography==45.0.7 +cupy-cuda12x==13.6.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +datasets==4.0.0 +deprecation==2.1.0 +depyf==0.18.0 +device-smi==0.4.1 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docling==2.54.0 +docling-core==2.48.2 +docling-ibm-models==3.9.1 +docling-parse==4.4.0 +docstring_parser==0.17.0 +durationpy==0.10 +easyocr==1.7.2 +einops==0.8.1 +elastic-transport==9.2.0 +elasticsearch==9.1.1 +email-validator==2.3.0 +environs==9.5.0 +et_xmlfile==2.0.0 +exceptiongroup==1.3.0 +Faker==37.6.0 +fastapi==0.116.1 +fastapi-cli==0.0.10 +fastapi-cloud-cli==0.1.5 +fastrlock==0.8.3 +filelock==3.19.1 +filetype==1.2.0 +FlagEmbedding==1.3.5 +flatbuffers==25.9.23 +fonttools==4.60.1 +frozenlist==1.7.0 +fschat==0.2.36 +fsspec==2025.3.0 +gguf==0.10.0 +gitdb==4.0.12 +GitPython==3.1.45 +google-auth==2.41.1 +googleapis-common-protos==1.71.0 +greenlet==3.2.4 +grpcio==1.75.1 +h11==0.16.0 +h2==4.3.0 +hf-xet==1.1.9 +hf_transfer==0.1.9 +hpack==4.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +httpx-sse==0.4.1 +huggingface-hub==0.34.4 +humanfriendly==10.0 +hyperframe==6.1.0 +idna==3.10 +ijson==3.4.0 +imageio==2.37.0 +importlib_metadata==8.7.0 +importlib_resources==6.5.2 +iniconfig==2.1.0 +inscriptis==2.6.0 +instructor==1.11.2 +interegular==0.3.3 +ir_datasets==0.5.11 +isodate==0.7.2 +Jinja2==3.1.6 +jiter==0.10.0 +joblib==1.5.2 +jsonlines==3.1.0 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonref==1.1.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +kiwisolver==1.4.9 +kubernetes==34.1.0 +lancedb==0.24.3 +langchain==0.3.27 +langchain-community==0.3.29 +langchain-core==0.3.75 +langchain-openai==0.3.32 +langchain-text-splitters==0.3.11 +langsmith==0.4.27 +lark==1.2.2 +latex2mathml==3.78.1 +lazy_loader==0.4 +llguidance==0.7.30 +llvmlite==0.43.0 +lm-format-enforcer==0.10.12 +logbar==0.0.4 +lxml==5.4.0 +lz4==4.4.4 +markdown-it-py==4.0.0 +markdown2==2.5.4 +marko==2.2.0 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +matplotlib==3.10.7 +mdurl==0.1.2 +milvus==2.3.5 +milvus-lite==2.5.1 +minio==7.2.16 +mistral_common==1.8.4 +mmh3==5.2.0 +mpire==2.10.2 +mpmath==1.3.0 +msgpack==1.1.1 +msgspec==0.19.0 +multidict==6.6.4 +multiprocess==0.70.16 +mypy_extensions==1.1.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +nh3==0.3.0 +ninja==1.13.0 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-ml-py==13.580.65 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +oauthlib==3.3.1 +onnxruntime==1.23.1 +openai==1.107.0 +openai-harmony==0.0.4 +opencv-python-headless==4.11.0.86 +openpyxl==3.1.5 +opentelemetry-api==1.38.0 +opentelemetry-exporter-otlp-proto-common==1.38.0 +opentelemetry-exporter-otlp-proto-grpc==1.38.0 +opentelemetry-proto==1.38.0 +opentelemetry-sdk==1.38.0 +opentelemetry-semantic-conventions==0.59b0 +optimum==1.27.0 +orjson==3.11.3 +outlines==0.1.11 +outlines_core==0.1.26 +overrides==7.7.0 +packaging==25.0 +pandas==2.3.2 +partial-json-parser==0.2.1.1.post6 +pathspec==0.12.1 +pdf2image==1.17.0 +peft==0.16.0 +pillow==11.3.0 +pip-tools==7.5.0 +pipdeptree==2.28.0 +platformdirs==4.4.0 
+pluggy==1.6.0 +polyfactory==2.22.2 +portalocker==3.2.0 +posthog==5.4.0 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.22.1 +prompt_toolkit==3.0.52 +propcache==0.3.2 +protobuf==6.32.0 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==21.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pybase64==1.4.2 +pybind11-stubgen==2.5.5 +pyclipper==1.3.0.post6 +pycodestyle==2.14.0 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.23.0 +pydantic==2.11.7 +pydantic-extra-types==2.10.5 +pydantic-settings==2.10.1 +pydantic_core==2.33.2 +Pygments==2.19.2 +pylatexenc==2.10 +pymilvus==2.6.2 +pymilvus.model==0.3.2 +pynvml==13.0.1 +pyparsing==3.2.5 +pypdfium2==4.30.0 +PyPika==0.48.9 +pyproject_hooks==1.2.0 +pytest==8.4.2 +python-bidi==0.6.6 +python-dateutil==2.9.0.post0 +python-docx==1.2.0 +python-dotenv==1.1.1 +python-json-logger==3.3.0 +python-multipart==0.0.20 +python-pptx==1.0.2 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==27.0.2 +qdrant-client==1.15.1 +qwen-vl-utils==0.0.14 +ragas==0.3.3 +random_word==1.0.13 +ray==2.49.1 +referencing==0.36.2 +regex==2025.9.1 +requests==2.32.5 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +responses==0.18.0 +rich==14.1.0 +rich-toolkit==0.15.1 +rignore==0.6.4 +rpds-py==0.27.1 +rsa==4.9.1 +rtree==1.4.1 +safetensors==0.6.2 +scikit-image==0.25.2 +scikit-learn==1.7.1 +scipy==1.15.3 +semchunk==2.2.2 +sentence-transformers==5.1.0 +sentencepiece==0.2.1 +sentry-sdk==2.37.0 +setproctitle==1.3.7 +shapely==2.1.1 +shellingham==1.5.4 +shortuuid==1.0.13 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +soundfile==0.13.1 +soupsieve==2.8 +soxr==0.5.0.post1 +SQLAlchemy==2.0.43 +starlette==0.47.3 +svgwrite==1.4.3 +sympy==1.13.1 +tabulate==0.9.0 +tenacity==9.1.2 +threadpoolctl==3.6.0 +tifffile==2025.5.10 +tiktoken==0.11.0 +tokenicer==0.0.4 +tokenizers==0.21.4 +tomli==2.2.1 +torch==2.6.0 +torchaudio==2.6.0 +torchvision==0.21.0 +tqdm==4.67.1 +transformers==4.53.3 +trec-car-tools==2.6 +triton==3.2.0 +typer==0.16.1 +typing-inspect==0.9.0 +typing-inspection==0.4.1 +typing_extensions==4.15.0 +tzdata==2025.2 +ujson==5.11.0 +unlzw3==0.2.3 +urllib3==2.3.0 +uv==0.8.15 +uvicorn==0.35.0 +uvloop==0.21.0 +vllm==0.8.0 +warc3-wet==0.2.5 +warc3-wet-clueweb09==0.2.5 +watchfiles==1.1.0 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.9.0 +websockets==15.0.1 +xformers==0.0.29.post2 +xgrammar==0.1.16 +xlsxwriter==3.2.5 +xxhash==3.5.0 +yarl==1.20.1 +zipp==3.23.0 +zlib-state==0.1.9 +zstandard==0.24.0 diff --git a/resource/.gitignore b/resource/.gitignore new file mode 100644 index 0000000..04a6e8d --- /dev/null +++ b/resource/.gitignore @@ -0,0 +1 @@ +generated/* \ No newline at end of file diff --git a/resource/bash_utils.sh b/resource/bash_utils.sh new file mode 100644 index 0000000..6301b28 --- /dev/null +++ b/resource/bash_utils.sh @@ -0,0 +1,431 @@ +#!/bin/bash + +# Prepend this before every script that use this utils file +# script_dir="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# script_name="$( basename -- "${BASH_SOURCE[0]}" )" +# source "$script_dir"/bash_utils.sh || exit 254 # replace with correct relative path if needed + +# Get script path +# script_dir="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +# Color strings {{{ +# Respect env variable NO_COLOR +if [[ -z "${NO_COLOR}" ]]; then + RED="$(tput setaf 1)" + GREEN="$(tput setaf 2)" + YELLOW="$(tput setaf 3)" + BLUE="$(tput setaf 4)" + MAGENTA="$(tput setaf 5)" + CYAN="$(tput setaf 6)" + ENDC="$(tput sgr0)" +else + RED="" + GREEN="" + YELLOW="" + BLUE="" + MAGENTA="" + CYAN="" + ENDC="" 
+fi
+# }}}
+
+printerr() {
+    # shellcheck disable=SC2059
+    printf "$1" "${@:2}" >&2
+}
+
+wprinterr() {
+    # shellcheck disable=SC2059
+    printf "${YELLOW}$1${ENDC}" "${@:2}" >&2
+}
+
+eprinterr() {
+    # shellcheck disable=SC2059
+    printf "${RED}$1${ENDC}" "${@:2}" >&2
+}
+
+dump_stack() {
+    # $1 skip stack level
+    if [ "$#" -eq 1 ]; then
+        local i="$1"
+    else
+        local i=0
+    fi
+    local line_no function_name file_name
+    wprinterr "Traceback (most recent call last):\n"
+    while caller "$i"; do
+        (( i++ ))
+    done | while read -r line_no function_name file_name; do
+        wprinterr "  File \"%s\", line %s, in %s\n" "$file_name" "$line_no" "$function_name"
+        wprinterr "    %s\n" "$(sed "${line_no}q;d" "$file_name" | sed "s/^\s*//g")"
+    done
+}
+
+show_current_stackframe() {
+    # $1 skip stack level
+    if [ "$#" -eq 1 ]; then
+        local i="$1"
+    else
+        local i=0
+    fi
+    local line_no function_name file_name
+    caller "$i" | while read -r line_no function_name file_name; do
+        printerr "Abort at file \"%s\", line %s, in %s\n" "$file_name" "$line_no" "$function_name"
+    done
+}
+
+__assert() {
+    # $1 dump stack skip level
+    # $2 should the value equal zero?
+    # $3 [optional] value to be judged on
+    # $4 [optional] message
+    # $5... [optional] arguments to be formatted
+    # return: 0 on success, 1 otherwise, 255 internal error
+    local err_str="Assertion Failed"
+    if [ "$#" -eq 0 ]; then
+        printf "__assert() internal error, called with no args\n"
+        exit 255
+    elif [ "$#" -eq 1 ] || [ "$#" -eq 2 ]; then
+        # assert with no args
+        :
+    elif { [ "$2" -ne 0 ] && [ "$3" -ne 0 ]; } || { [ "$2" -eq 0 ] && [ "$3" -eq 0 ]; }; then
+        if [ "$#" -ge 4 ]; then err_str="$err_str: $4"; fi
+    else
+        # assert not triggered
+        return 0
+    fi
+    printf "$err_str\n" "${@:5}"
+    dump_stack $(( 1 + "$1" ))
+    return 1
+}
+
+exit_on_retval() {
+    # $1 override return value, 0 to forward
+    local retval=$?
+    local act_retval=0
+    if [ "$#" -eq 0 ]; then
+        act_retval="$retval"
+    elif [ "$#" -eq 1 ]; then
+        act_retval="$1"
+        if ! is_valid_errcode "$act_retval"; then
+            act_retval="${predef_errcode_revmap[$act_retval]}"
+        fi
+        if [[ $act_retval =~ ^[0-9]+$ ]] && (( "$act_retval" >= 0 )) && (( "$act_retval" <= 255 )); then
+            [ "$act_retval" -eq 0 ] && act_retval="$retval"
+        else
+            assert 0 \
+                "exit_on_retval internal error, return value <%s> is not an integer between 0 and 255" \
+                "$act_retval"
+            exit 255
+        fi
+    else
+        assert 0 "exit_on_retval internal error, argument ill-formatted"
+        exit 255
+    fi
+    if [ "$retval" -ne 0 ]; then exit "$act_retval"; fi
+    return $retval
+}
+
+# assert & assert_zero
+# $1 [optional] value to be judged on
+# $2 [optional] message
+# $3... [optional] arguments to be formatted
+# return 1 if assertion failed
+assert() { __assert 1 0 "$@"; }
+assert_zero() { __assert 1 1 "$@"; }
+
+# assert_exit_default & assert_zero_exit_default
+# $1 [optional] value to be judged on
+# $2 [optional] message
+# $3... [optional] arguments to be formatted
+# exit 1 if assertion failed
+assert_exit_default() { __assert 1 0 "$@"; exit_on_retval; }
+assert_zero_exit_default() { __assert 1 1 "$@"; exit_on_retval; }
+
+
+# assert_exit & assert_zero_exit
+# $1 exit code when assertion failed
+# $2 [optional] value to be judged on
+# $3 [optional] message
+# $4... [optional] arguments to be formatted
+# exit if assertion failed
+assert_exit() { __assert 1 0 "${@:2}"; exit_on_retval "$1"; }
+assert_zero_exit() { __assert 1 1 "${@:2}"; exit_on_retval "$1"; }
+
+# assert_false_exit
+# $1 exit code when assertion failed
+# $2 [optional] message
+# $3... [optional] arguments to be formatted
+# exit if assertion failed
+assert_false_exit() { __assert 1 1 1 "${@:2}"; exit_on_retval "$1"; }
+
+declare -A predef_errcode
+declare -A predef_errcode_revmap
+declare -A predef_errcode_dscr
+
+is_valid_errcode() {
+    [[ "$1" =~ ^[0-9]+$ ]] && [ "$1" -ge 0 ] && [ "$1" -le 255 ]
+    return $?
+}
+
+define_errcode() {
+    #1 errcode
+    #2 errstr
+    #3 errdscr
+    local errcode="$1"
+    local errstr="$2"
+    local errdscr="$3"
+    if ! is_valid_errcode "$errcode"; then
+        assert_false_exit 255 \
+            "Error when defining errcode, exit code <%s> is not an integer between 0 and 255" \
+            "$errcode"
+    fi
+    if is_valid_errcode "$errstr"; then
+        assert_false_exit 255 \
+            "Error when defining errcode, error string <%s> cannot be an integer between 0 and 255" \
+            "$errstr"
+    fi
+    if [[ -v predef_errcode[$errcode] ]] && [[ $errstr != "${predef_errcode[$errcode]}" ]]; then
+        assert_false_exit 255 \
+            "Error when defining errcode, errcode %d already defined with errstr %s" \
+            "$errcode" "${predef_errcode[$errcode]}"
+    fi
+
+    predef_errcode[$errcode]="$errstr"
+    predef_errcode_revmap[$errstr]=$errcode
+    predef_errcode_dscr[$errcode]="$errdscr"
+}
+
+get_errcode_from_errstr() {
+    local errstr="$1"
+    [[ -v predef_errcode_revmap[$errstr] ]] || assert_false_exit 255 \
+        "Unknown error string <%s>, no errcode has been defined for it" \
+        "$errstr"
+    printf "%d" "${predef_errcode_revmap[$errstr]}"
+}
+
+# predefined error codes
+define_errcode 0 "normal_termination" "script terminates correctly"
+define_errcode 253 "user_abort" "user abort"
+define_errcode 254 "dependency_error" "dependency error"
+define_errcode 255 "internal_error" "internal error"
+
+display_predef_errcode() {
+    printerr "Return values\n"
+    for errcode in $(echo "${!predef_errcode_dscr[@]}" | xargs -n1 | sort -h); do
+        local errcode_dscr="${predef_errcode_dscr[$errcode]}"
+        printerr "  %-3s %s\n" "$errcode" "$errcode_dscr"
+    done
+}
+
+abort() {
+    # $1 error code/error string
+    # $2 [optional] message
+    # $3... [optional] arguments to be formatted
+    local exit_input="$1"
+    local extra_dscr=""
+    if is_valid_errcode "$exit_input"; then
+        exit_code="$exit_input"
+    else
+        exit_code="${predef_errcode_revmap[$exit_input]}"
+    fi
+    if [[ -v predef_errcode[$exit_code] ]]; then extra_dscr=" (${predef_errcode[$exit_code]})"; fi
+    if [[ $# -ge 2 ]]; then
+        printf "Abort%s: $2\n" "$extra_dscr" "${@:3}"
+    else
+        printf "Abort%s\n" "$extra_dscr"
+    fi
+    show_current_stackframe 1
+    exit "$exit_code"
+}
+
+check_and_abort() {
+    # $1 exit code
+    # $2 [optional] message
+    # $3... [optional] arguments to be formatted
+    if [[ $1 -ge 64 ]]; then
+        abort "${@}"
+    fi
+}
+
+check_dependency() {
+    # $1 check type
+    # $2 dependency
+    # $3 [optional] verbose (default to true)
+    local retval
+    test -"$1" "$2"
+    retval=$?
+    if [ $retval -eq 2 ]; then
+        assert_false_exit internal_error
+    elif [ $retval -eq 0 ]; then
+        printf "%s" "$(realpath "$2")"
+        return 0
+    else
+        printf "%s" "$2"
+        if [ $# -le 2 ]; then
+            printerr "Dependency <%s> not found\n" "$2"
+        fi
+        return "$(get_errcode_from_errstr dependency_error)"
+    fi
+}
+
+pretty_countdown() {
+    [ "$#" -ge 2 ]; assert_zero_exit $?
+
+    local from=$2
+    local interval=1
+    local wait_time remaining
+    for wait_time in $(seq 0 "$interval" "$(echo "$from + $interval - 1" | bc)"); do
+        remaining="$(echo "scale=9; $from - $wait_time" | bc)"
+        printf "\r\033[2K%s %s" "$1" "$remaining"
+        sleep "$(echo "$interval + (($interval > $remaining) * ($remaining - $interval))" | bc)"
+    done
+    printf "\r\033[2K%s 0" "$1"
+    printf "\n"
+}
+
+display_time() {
+    printf "### Current time BEGIN ###########################################\n"
+    timedatectl | sed -e "s/^/# /"
+    printf "### Current time END #############################################\n"
+}
+
+# The input content should follow the below format:
+# 1. Each line is treated as an entry
+# 2. The key and value are separated using the specified delimiter (which may be multiple chars)
+# Note that:
+# 1. Delimiter searching is greedy, from left to right
+as_associative_arr() {
+    # $1 variable name, should be an associative array
+    # $2 content
+    # $3 delimiter
+    [ "$#" -eq 3 ]; assert_zero $?
+
+    local row rows assignment_expr
+    readarray rows <<< "$2"
+    for row in "${rows[@]}"; do
+        assignment_expr="$(sed -re "s/([^\\$3 ]*)\s*\\$3\s*(.*)\s*/[\"\1\"]=\"\2\"/" <<< "$row")"
+        if [[ $assignment_expr =~ \[.*\]=.* ]]; then
+            eval "$1$assignment_expr"
+        else
+            printerr "Error when translating line \"%s\", skipping insertion\n" "$row"
+        fi
+    done
+}
+
+# https://stackoverflow.com/questions/1527049/how-can-i-join-elements-of-a-bash-array-into-a-delimited-string
+str_join() {
+    # $1 delimiter
+    # $2... strings to be concatenated
+    local d=${1-} f=${2-}
+    if shift 2; then
+        printf %s "$f" "${@/#/$d}"
+    fi
+}
+
+display_options() {
+    # $1 option list
+    # $2 default option
+    # $3 result var name
+    # $4 print option long descriptions
+    # $5 [optional] message
+    # $6... [optional] arguments to be formatted
+    local -n options_ref="$1"
+    local display_options=()
+    local default_selection="$2"
+    local result_var="$3"
+    local message="$5"
+    local short_format=1
+    local option_selected=0
+    # determine if option is using short format
+    for option in "${!options_ref[@]}"; do
+        if [[ ${#option} -ne 1 ]]; then
+            short_format=0
+            break
+        fi
+    done
+    # display header message
+    if [[ -z $message ]]; then message="Select from options"; fi
+    # shellcheck disable=SC2059
+    printf "$message\n" "${@:6}"
+    for option in "${!options_ref[@]}"; do
+        if [[ $short_format -ne 0 ]]; then
+            [[ $option == [a-z] ]] || \
+                assert_false_exit 255 "%s internal error, option <%s> is ill-formatted, not matching regex [a-z]" \
+                    "${FUNCNAME[0]}" "$option"
+            if [[ "$default_selection" == "$option" ]]; then
+                option="${option^^}"
+                option_selected=1
+            fi
+            print_delimiter=""
+        else
+            [[ $option =~ [a-z]+ ]] || \
+                assert_false_exit 255 "%s internal error, option <%s> is ill-formatted, not matching regex [a-z]+" \
+                    "${FUNCNAME[0]}" "$option"
+            if [[ "$default_selection" == "$option" ]]; then
+                option="[$option]"
+                option_selected=1
+            fi
+            print_delimiter="/"
+        fi
+        display_options+=( "$option" )
+    done
+    [[ $option_selected -eq 1 ]] || \
+        assert_false_exit 255 "Default option %s is not found in option list" "$default_selection"
+    # display option and help messages
+    local max_opt_len=0
+    local default_mark_str="(default) "
+    for option in "${!options_ref[@]}"; do
+        local opt_len="${#option}"
+        max_opt_len=$(( max_opt_len > opt_len ?
max_opt_len : opt_len )) + done + for option in "${!options_ref[@]}"; do + if [[ "$default_selection" == "$option" ]]; then default_str="$default_mark_str"; else default_str=""; fi + # shellcheck disable=SC2059 + printf " %${#default_mark_str}s%${max_opt_len}s: %s\n" "$default_str" "$option" "${options_ref[$option]}" + done + # display selection prompt + printf " Selection (%s) ? " "$(str_join "$print_delimiter" "${display_options[@]}")" + selection="" + while :; do + read -r selection + selection="${selection,,}" + if [[ -z "$selection" ]]; then selection="$default_selection"; fi + if [[ -v options_ref[$selection] ]]; then + printf -v "$result_var" "%s" "$selection" + return + else + # display re-selection prompt + printf " Invalid selection (%s) ? " "$(str_join "$print_delimiter" "${display_options[@]}")" + fi + done +} + +display_yes_no_option() { + # $1 message + # $2... [optional] arguments to be formatted + declare -A options + declare selection + options=( + ["y"]="confirm" + ["n"]="deny" + ) + display_options options n selection 0 "$@" + unset options + if [ "$selection" == "y" ]; then return 0; fi + return 1; +} + +# pretty printing +time_print_interval() { + total_time="$1" + interval="$2" + current_time=0 + while [ "$current_time" -lt "$total_time" ]; do + sleep_time=$(("$total_time" - "$current_time")) + sleep_time=$(("$sleep_time" > "$interval" ? "$interval" : "$sleep_time")) + echo "$current_time/$total_time sleep $sleep_time" + sleep "$sleep_time" + current_time=$(("$current_time" + "$interval")) + done +} diff --git a/resource/black_format/.black-format b/resource/black_format/.black-format new file mode 100644 index 0000000..6318acc --- /dev/null +++ b/resource/black_format/.black-format @@ -0,0 +1,17 @@ +[tool.black] +line-length = 100 +target-version = ["py310"] +# path from root of the repository +force-exclude = ''' +( + \.pyi$ + | \.git/ + | \.github/ + | resource/ + | third_party/ + | .*/generated/ + | build/ + | .*/__pycache__/ + | .*/*_pb2.py +) +''' diff --git a/resource/build_helper/libclang_get_lib_version.py b/resource/build_helper/libclang_get_lib_version.py new file mode 100644 index 0000000..df3364e --- /dev/null +++ b/resource/build_helper/libclang_get_lib_version.py @@ -0,0 +1,56 @@ +# Get current system libclang version +# Usage: python3 libclang_get_lib_version.py [path_to_libclang.so] +# Outputs: the version number of the libclang shared library +# Returns: the version number of the libclang shared library + +import ctypes +import sys, subprocess + +# Determine libclang_path +if len(sys.argv) == 1: + # Find using ldconfig + p = subprocess.Popen(["ldconfig", "-p"], stdout=subprocess.PIPE, stderr=sys.stderr) + sharedlibs, _ = p.communicate() + assert p.returncode == 0, "Failed to run ldconfig command." 
+
+    import re
+    line = [
+        line.strip()
+        for line in sharedlibs.decode().splitlines()
+        if re.search(r"libclang-[0-9]+", line)
+    ][-1]  # Get the last line that matches the regex
+    libclang_path = line.split("=>")[-1].strip()
+elif len(sys.argv) == 2:
+    # Load a given libclang.so
+    libclang_path = sys.argv[1]
+else:
+    sys.exit(1)
+
+lib = ctypes.CDLL(libclang_path)
+
+# Define CXString struct
+class CXString(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("private_flags", ctypes.c_uint)
+    ]
+
+# Declare function signatures
+lib.clang_getClangVersion.restype = CXString
+lib.clang_getCString.argtypes = [CXString]
+lib.clang_getCString.restype = ctypes.c_char_p
+lib.clang_disposeString.argtypes = [CXString]
+
+# Call the function
+version = lib.clang_getClangVersion()
+version_str = lib.clang_getCString(version).decode()
+lib.clang_disposeString(version)
+
+# Extract the version number using regex
+match = re.search(r'version (\d+\.\d+\.\d+)', version_str)
+if match:
+    version_number = match.group(1)
+    print(version_number)
+else:
+    sys.exit(1)
\ No newline at end of file
diff --git a/resource/build_helper/py3_require_executable_module.py b/resource/build_helper/py3_require_executable_module.py
new file mode 100644
index 0000000..158341c
--- /dev/null
+++ b/resource/build_helper/py3_require_executable_module.py
@@ -0,0 +1,25 @@
+import os, sys
+import importlib.util
+
+if len(sys.argv) != 2:
+    print(
+        f"{os.path.basename(sys.argv[0])} requires a module name as an argument",
+        file=sys.stderr)
+    exit(2)
+
+def is_executable(module_name: str) -> bool:
+    # First check if the module can be imported
+    spec = importlib.util.find_spec(module_name)
+    if spec is None:
+        return False
+
+    # If it's a package, try looking for module_name.__main__
+    # (rather than module_name/__main__.py directly)
+    main_spec = importlib.util.find_spec(f"{module_name}.__main__")
+    if main_spec is not None and main_spec.origin and main_spec.origin.endswith("__main__.py"):
+        return True
+
+    return False
+
+if not is_executable(sys.argv[1]):
+    exit(1)
diff --git a/resource/build_helper/py3_require_package.py b/resource/build_helper/py3_require_package.py
new file mode 100644
index 0000000..8c1970c
--- /dev/null
+++ b/resource/build_helper/py3_require_package.py
@@ -0,0 +1,34 @@
+# Check for Python package dependencies
+# Usage: python3 py3_require_package.py "package_name[specifier]"
+# Outputs: the installed version for the package if the requirement is met
+# Returns: 0 if the requirement is met, non-zero otherwise
+
+from importlib.metadata import version, PackageNotFoundError
+from packaging.requirements import Requirement
+import sys, os
+
+if "NO_COLOR" in os.environ and len(os.environ["NO_COLOR"]) != 0:
+    RED = ""
+    RESET = ""
+else:
+    RED = "\033[31m"
+    RESET = "\033[0m"
+
+assert len(sys.argv) >= 2, "Usage: python3 py3_require_package.py 'package_name[specifier]'"
+
+req = Requirement(sys.argv[1])
+try:
+    installed_version = version(req.name)
+except PackageNotFoundError:
+    print(
+        f"{RED}[PyPkg Dependency Checker] Package {req.name} is not installed.{RESET}",
+        file=sys.stderr)
+    sys.exit(1)
+if installed_version not in req.specifier:
+    print(
+        f"{RED}[PyPkg Dependency Checker] Package {req.name} is installed "
+        f"(version {installed_version}) but does not satisfy the requirement: "
+        f"{req.name}{req.specifier}{RESET}",
+        file=sys.stderr)
+    sys.exit(1)
+print(installed_version)
\ No newline at end of file
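The package checker above leans on a single mechanism: packaging parses the requirement string into a name plus a specifier set, and the installed version is tested for containment. A small standalone illustration (the package name and version numbers below are arbitrary examples, not requirements of this repository):

    # The containment test behind py3_require_package.py.
    from packaging.requirements import Requirement

    req = Requirement("protobuf>=4.0,<7.0")
    print(req.name)                   # protobuf
    print("6.32.0" in req.specifier)  # True  -> requirement satisfied
    print("3.20.1" in req.specifier)  # False -> the checker would exit non-zero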
diff --git a/resource/clang_format/.clang-format b/resource/clang_format/.clang-format
new file mode 100644
index 0000000..be93367
--- /dev/null
+++ b/resource/clang_format/.clang-format
@@ -0,0 +1,23 @@
+Language: Cpp
+BasedOnStyle: Google
+
+IndentWidth: 4
+ColumnLimit: 100
+IndentAccessModifiers: false
+AccessModifierOffset: -2
+
+IncludeBlocks: Preserve
+
+DerivePointerAlignment: false
+PointerAlignment: Right
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: Right
+
+ReflowComments: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
diff --git a/resource/clang_format/run_clang_format.py b/resource/clang_format/run_clang_format.py
new file mode 100644
index 0000000..a657125
--- /dev/null
+++ b/resource/clang_format/run_clang_format.py
@@ -0,0 +1,432 @@
+#!/usr/bin/env python
+"""A wrapper script around clang-format, suitable for linting multiple files
+and to use for continuous integration.
+
+This is an alternative API for the clang-format command line.
+It runs over multiple files and directories in parallel.
+A diff output is produced and a sensible exit code is returned.
+
+Adapted from: https://github.com/Sarcasm/run-clang-format/blob/master/run-clang-format.py
+
+"""
+
+from __future__ import print_function, unicode_literals
+
+import argparse
+import codecs
+import difflib
+import fnmatch
+import io
+import errno
+import multiprocessing
+import os
+import signal
+import subprocess
+import sys
+import traceback
+
+from functools import partial
+
+try:
+    from subprocess import DEVNULL  # py3k
+except ImportError:
+    DEVNULL = open(os.devnull, "wb")
+
+
+DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx'
+DEFAULT_CLANG_FORMAT_IGNORE = '.clang-format-ignore'
+
+
+class ExitStatus:
+    SUCCESS = 0
+    DIFF = 1
+    TROUBLE = 2
+
+def excludes_from_files(ignore_files):
+    excludes = []
+
+    for ignore_file in ignore_files:
+        ignore_file_dir = os.path.dirname(ignore_file)
+        try:
+            with io.open(ignore_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    if line.startswith('#'):
+                        # ignore comments
+                        continue
+                    pattern = line.rstrip()
+                    if not pattern:
+                        # allow empty lines
+                        continue
+                    excludes.append(f"{ignore_file_dir}/{pattern}")
+        except EnvironmentError as e:
+            if e.errno != errno.ENOENT:
+                raise
+    return excludes
+
+def list_files(files, recursive=False, extensions=None, exclude=None):
+    if extensions is None:
+        extensions = []
+    if exclude is None:
+        exclude = []
+
+    out = []
+    for file in files:
+        if recursive and os.path.isdir(file):
+            for dirpath, dnames, fnames in os.walk(file):
+                fpaths = [os.path.join(dirpath, fname) for fname in fnames]
+                for pattern in exclude:
+                    # os.walk() supports trimming down the dnames list
+                    # by modifying it in-place,
+                    # to avoid unnecessary directory listings.
+ dnames[:] = [ + x for x in dnames + if + not fnmatch.fnmatch(os.path.join(dirpath, x), pattern) + ] + fpaths = [ + x for x in fpaths if not fnmatch.fnmatch(x, pattern) + ] + for f in fpaths: + ext = os.path.splitext(f)[1][1:] + if ext in extensions: + out.append(f) + else: + out.append(file) + return out + + +def make_diff(file, original, reformatted): + return list( + difflib.unified_diff( + original, + reformatted, + fromfile='{}\t(original)'.format(file), + tofile='{}\t(reformatted)'.format(file), + n=3)) + + +class DiffError(Exception): + def __init__(self, message, errs=None): + super(DiffError, self).__init__(message) + self.errs = errs or [] + + +class UnexpectedError(Exception): + def __init__(self, message, exc=None): + super(UnexpectedError, self).__init__(message) + self.formatted_traceback = traceback.format_exc() + self.exc = exc + + +def run_clang_format_diff_wrapper(args, file): + try: + ret = run_clang_format_diff(args, file) + return ret + except DiffError: + raise + except Exception as e: + raise UnexpectedError('{}: {}: {}'.format(file, e.__class__.__name__, + e), e) + + +def run_clang_format_diff(args, file): + try: + with io.open(file, 'r', encoding='utf-8') as f: + original = f.readlines() + except IOError as exc: + raise DiffError(str(exc)) + + if args.in_place: + invocation = [args.clang_format_executable, '-i', file] + else: + invocation = [args.clang_format_executable, file] + + if args.style: + invocation.extend(['--style', args.style]) + + if args.clang_format_style_file: + invocation.extend([f'-style=file:{args.clang_format_style_file}']) + + if args.dry_run: + print(" ".join(invocation)) + return [], [] + + # Use of utf-8 to decode the process output. + # + # Hopefully, this is the correct thing to do. + # + # It's done due to the following assumptions (which may be incorrect): + # - clang-format will returns the bytes read from the files as-is, + # without conversion, and it is already assumed that the files use utf-8. + # - if the diagnostics were internationalized, they would use utf-8: + # > Adding Translations to Clang + # > + # > Not possible yet! + # > Diagnostic strings should be written in UTF-8, + # > the client can translate to the relevant code page if needed. + # > Each translation completely replaces the format string + # > for the diagnostic. + # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation + # + # It's not pretty, due to Python 2 & 3 compatibility. 
+ encoding_py3 = {} + if sys.version_info[0] >= 3: + encoding_py3['encoding'] = 'utf-8' + + try: + proc = subprocess.Popen( + invocation, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + **encoding_py3) + except OSError as exc: + raise DiffError( + "Command '{}' failed to start: {}".format( + subprocess.list2cmdline(invocation), exc + ) + ) + proc_stdout = proc.stdout + proc_stderr = proc.stderr + if sys.version_info[0] < 3: + # make the pipes compatible with Python 3, + # reading lines should output unicode + encoding = 'utf-8' + proc_stdout = codecs.getreader(encoding)(proc_stdout) + proc_stderr = codecs.getreader(encoding)(proc_stderr) + # hopefully the stderr pipe won't get full and block the process + outs = list(proc_stdout.readlines()) + errs = list(proc_stderr.readlines()) + proc.wait() + if proc.returncode: + raise DiffError( + "Command '{}' returned non-zero exit status {}".format( + subprocess.list2cmdline(invocation), proc.returncode + ), + errs, + ) + if args.in_place: + return [], errs + return make_diff(file, original, outs), errs + + +def bold_red(s): + return '\x1b[1m\x1b[31m' + s + '\x1b[0m' + + +def colorize(diff_lines): + def bold(s): + return '\x1b[1m' + s + '\x1b[0m' + + def cyan(s): + return '\x1b[36m' + s + '\x1b[0m' + + def green(s): + return '\x1b[32m' + s + '\x1b[0m' + + def red(s): + return '\x1b[31m' + s + '\x1b[0m' + + for line in diff_lines: + if line[:4] in ['--- ', '+++ ']: + yield bold(line) + elif line.startswith('@@ '): + yield cyan(line) + elif line.startswith('+'): + yield green(line) + elif line.startswith('-'): + yield red(line) + else: + yield line + + +def print_diff(diff_lines, use_color): + if use_color: + diff_lines = colorize(diff_lines) + if sys.version_info[0] < 3: + sys.stdout.writelines((l.encode('utf-8') for l in diff_lines)) + else: + sys.stdout.writelines(diff_lines) + + +def print_trouble(prog, message, use_colors): + error_text = 'error:' + if use_colors: + error_text = bold_red(error_text) + print("{}: {} {}".format(prog, error_text, message), file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--clang-format-executable', + metavar='EXECUTABLE', + help='path to the clang-format executable', + default='clang-format') + parser.add_argument( + '--extensions', + help='comma separated list of file extensions (default: {})'.format( + DEFAULT_EXTENSIONS), + default=DEFAULT_EXTENSIONS) + parser.add_argument( + '-r', + '--recursive', + action='store_true', + help='run recursively over directories') + parser.add_argument( + '-d', + '--dry-run', + action='store_true', + help='just print the list of files') + parser.add_argument( + '-i', + '--in-place', + action='store_true', + help='format file instead of printing differences') + parser.add_argument('files', metavar='file', nargs='+') + parser.add_argument( + '-q', + '--quiet', + action='store_true', + help="disable output, useful for the exit code") + parser.add_argument( + '-j', + metavar='N', + type=int, + default=0, + help='run N clang-format jobs in parallel' + ' (default number of cpus + 1)') + parser.add_argument( + '--color', + default='auto', + choices=['auto', 'always', 'never'], + help='show colored diff (default: auto)') + parser.add_argument( + '-e', + '--exclude', + metavar='PATTERN', + action='append', + default=[], + help='exclude paths matching the given glob-like pattern(s)' + ' from recursive search') + parser.add_argument( + '-n', + '--clang-format-ignore', + metavar='IGNORE_FILE', 
+        action='append',
+        default=[],
+        help='read exclude patterns from the given clang-format ignore file(s)')
+    parser.add_argument(
+        '--clang-format-style-file',
+        metavar='STYLE_FILE',
+        help='use the given file as a clang-format style file,'
+        ' overrides --style option')
+    parser.add_argument(
+        '--style',
+        help='formatting style to apply (LLVM, Google, Chromium, Mozilla, WebKit)')
+
+    args = parser.parse_args()
+
+    # use default signal handling, like diff return SIGINT value on ^C
+    # https://bugs.python.org/issue14229#msg156446
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+    try:
+        signal.SIGPIPE
+    except AttributeError:
+        # compatibility, SIGPIPE does not exist on Windows
+        pass
+    else:
+        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+    colored_stdout = False
+    colored_stderr = False
+    if args.color == 'always':
+        colored_stdout = True
+        colored_stderr = True
+    elif args.color == 'auto':
+        colored_stdout = sys.stdout.isatty()
+        colored_stderr = sys.stderr.isatty()
+
+    version_invocation = [args.clang_format_executable, "--version"]
+    try:
+        subprocess.check_call(version_invocation, stdout=DEVNULL)
+    except subprocess.CalledProcessError as e:
+        print_trouble(parser.prog, str(e), use_colors=colored_stderr)
+        return ExitStatus.TROUBLE
+    except OSError as e:
+        print_trouble(
+            parser.prog,
+            "Command '{}' failed to start: {}".format(
+                subprocess.list2cmdline(version_invocation), e
+            ),
+            use_colors=colored_stderr,
+        )
+        return ExitStatus.TROUBLE
+
+    retcode = ExitStatus.SUCCESS
+
+    clang_format_ignore_files = [DEFAULT_CLANG_FORMAT_IGNORE]
+    if args.clang_format_ignore:
+        clang_format_ignore_files.extend(args.clang_format_ignore)
+    excludes = excludes_from_files(clang_format_ignore_files)
+    excludes.extend(args.exclude)
+
+    files = list_files(
+        args.files,
+        recursive=args.recursive,
+        exclude=excludes,
+        extensions=args.extensions.split(','))
+
+    if not files:
+        return ExitStatus.SUCCESS
+
+    njobs = args.j
+    if njobs == 0:
+        njobs = multiprocessing.cpu_count() + 1
+    njobs = min(len(files), njobs)
+
+    if njobs == 1:
+        # execute directly instead of in a pool,
+        # less overhead, simpler stacktraces
+        it = (run_clang_format_diff_wrapper(args, file) for file in files)
+        pool = None
+    else:
+        pool = multiprocessing.Pool(njobs)
+        it = pool.imap_unordered(
+            partial(run_clang_format_diff_wrapper, args), files)
+        pool.close()
+    while True:
+        try:
+            outs, errs = next(it)
+        except StopIteration:
+            break
+        except DiffError as e:
+            print_trouble(parser.prog, str(e), use_colors=colored_stderr)
+            retcode = ExitStatus.TROUBLE
+            sys.stderr.writelines(e.errs)
+        except UnexpectedError as e:
+            print_trouble(parser.prog, str(e), use_colors=colored_stderr)
+            sys.stderr.write(e.formatted_traceback)
+            retcode = ExitStatus.TROUBLE
+            # stop at the first unexpected error,
+            # something could be very wrong,
+            # don't process all files unnecessarily
+            if pool:
+                pool.terminate()
+            break
+        else:
+            sys.stderr.writelines(errs)
+            if outs == []:
+                continue
+            outs.append("\n")
+            if not args.quiet:
+                print_diff(outs, use_color=colored_stdout)
+            if retcode == ExitStatus.SUCCESS:
+                retcode = ExitStatus.DIFF
+    if pool:
+        pool.join()
+    return retcode
+
+
+if __name__ == '__main__':
+    sys.exit(main())
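A typical way to drive this wrapper from CI is to point it at the C++ source tree with recursion enabled and let the exit code gate the job. The snippet below is only a sketch: the source directory and extension list are placeholders, while the flags are the ones defined by the argparse options above.

    # Hypothetical CI hook around run_clang_format.py; exit code 0 means clean,
    # 1 means formatting diffs were found, 2 means the tool itself failed.
    import subprocess
    import sys

    result = subprocess.run([
        sys.executable, "resource/clang_format/run_clang_format.py",
        "-r",                          # recurse into directories
        "--extensions", "cc,hh,ipp",   # placeholder extension list
        "--clang-format-style-file", "resource/clang_format/.clang-format",
        "path/to/cpp/sources",         # placeholder source directory
    ])
    sys.exit(result.returncode)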
diff --git a/resource/proto/cpu_metrics.proto b/resource/proto/cpu_metrics.proto
new file mode 100644
index 0000000..d92bc93
--- /dev/null
+++ b/resource/proto/cpu_metrics.proto
@@ -0,0 +1,48 @@
+syntax = "proto3";
+
+message CoreStat {
+    optional uint64 user = 1;
+    optional uint64 nice = 2;
+    optional uint64 system = 3;
+    optional uint64 idle = 4;
+    optional uint64 iowait = 5;
+    optional uint64 irq = 6;
+    optional uint64 softirq = 7;
+    optional uint64 steal = 8;
+    optional uint64 guest = 9;
+    optional uint64 guest_nice = 10;
+}
+
+message KernelMiscStat {
+    optional uint64 intr = 1;
+    optional uint64 ctxt = 2;
+    optional uint64 processes = 3;
+    optional uint32 procs_running = 4;
+    optional uint32 procs_blocked = 5;
+}
+
+message SoftIRQStat {
+    optional uint64 total = 1;
+    optional uint64 hi = 2;
+    optional uint64 timer = 3;
+    optional uint64 net_tx = 4;
+    optional uint64 net_rx = 5;
+    optional uint64 block = 6;
+    optional uint64 irq_poll = 7;
+    optional uint64 tasklet = 8;
+    optional uint64 sched = 9;
+    optional uint64 hrtimer = 10;
+    optional uint64 rcu = 11;
+}
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_stat.5.html
+message CPUMetrics {
+    optional uint64 timestamp = 1;
+    repeated CoreStat core_stats = 2;
+    optional KernelMiscStat kernel_misc_stat = 3;
+    optional SoftIRQStat soft_irq_stat = 4;
+}
+
+message CPUMetricsTimeSeries {
+    repeated CPUMetrics metrics = 1;
+}
diff --git a/resource/proto/disk_metrics.proto b/resource/proto/disk_metrics.proto
new file mode 100644
index 0000000..bfbf92f
--- /dev/null
+++ b/resource/proto/disk_metrics.proto
@@ -0,0 +1,45 @@
+/* NOTE: Assumes %d and %u are 32-bit, %ld %lu, %lld, and %llu are 64-bit
+ * integers on a conventional 64-bit system. */
+
+// Refer to https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats
+// All times are measured in milliseconds (ms).
+message PerDiskMetrics {
+    reserved 1, 2, 3;
+    // [NT] uint32 major = 1;
+    // [NT] uint32 minor = 2;
+    // [NT] string dev_name = 3;
+    optional uint64 reads_completed = 4;
+    optional uint64 reads_merged = 5;
+    optional uint64 sectors_read = 6;
+    optional uint64 time_spent_reading = 7;
+    optional uint64 writes_completed = 8;
+    optional uint64 writes_merged = 9;
+    optional uint64 sectors_written = 10;
+    optional uint64 time_spent_writing = 11;
+    optional uint64 io_in_progress = 12;
+    optional uint64 time_spent_io = 13;
+    optional uint64 weighted_time_spent_io = 14;
+    optional uint64 discard_completed = 15;
+    optional uint64 discard_merged = 16;
+    optional uint64 discard_sectors = 17;
+    optional uint64 time_spent_discarding = 18;
+    optional uint64 flush_completed = 19;
+    optional uint64 time_spent_flushing = 20;
+}
+
+message DiskMetrics {
+    optional uint64 timestamp = 1;
+    repeated PerDiskMetrics disk_metrics = 2;
+}
+
+message DiskMetricsTimeSeries {
+    repeated DiskMetrics metrics = 1;
+}
+
+message DiskMetadata {
+    optional string dev_name = 1;
+}
+
+message DiskMetricsMetadata {
+    repeated DiskMetadata disk_meta = 1;
+}
\ No newline at end of file
diff --git a/resource/proto/gpu_metrics.proto b/resource/proto/gpu_metrics.proto
new file mode 100644
index 0000000..dbe6c23
--- /dev/null
+++ b/resource/proto/gpu_metrics.proto
@@ -0,0 +1,119 @@
+message PerProcessGPUMetrics {
+    optional uint32 pid = 1;
+    optional uint64 used_gpu_memory = 2;
+}
+
+message PerGPUMetrics {
+    repeated double NVML_metrics_values = 1;
+    repeated double GPM_metrics_values = 2;
+    repeated PerProcessGPUMetrics per_process_gpu_metrics = 3;
+}
+
+message GPUMetrics {
+    optional uint64 timestamp = 1;
+    repeated PerGPUMetrics per_gpu_metrics = 2;
+}
+
+message GPUMetricsTimeSeries {
+    repeated GPUMetrics metrics = 1;
+}
+
+message CUDACC {
+    optional int32 major = 1;
+    optional int32 minor = 2;
+}
+
+message GPUProperties {
+    optional string dev_name = 1;
+    optional string bus_id = 2;
+    optional CUDACC compute_capability
= 3; + optional int32 link_generation = 4; + optional int32 link_width = 5; +} + +message GPUMetadata { + enum NVMLProbe { + NVML_PCIe_throughput = 0; + NVML_METRIC_MAX = 1; + } + + // directly copied from pynvml.py, https://pythonhosted.org/nvidia-ml-py/ + enum GPMProbe { + GPM_UNSPECIFIED = 0; // Invalid metric, placeholder for 0 + GPM_GRAPHICS_UTIL = 1; // Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 + GPM_SM_UTIL = 2; // Percentage of SMs that were busy. 0.0 - 100.0 + GPM_SM_OCCUPANCY = 3; // Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 + GPM_INTEGER_UTIL = 4; // Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 + GPM_ANY_TENSOR_UTIL = 5; // Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 + GPM_DFMA_TENSOR_UTIL = 6; // Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 + GPM_HMMA_TENSOR_UTIL = 7; // Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 + GPM_IMMA_TENSOR_UTIL = 9; // Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 + GPM_DRAM_BW_UTIL = 10; // Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 + GPM_FP64_UTIL = 11; // Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 + GPM_FP32_UTIL = 12; // Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 + GPM_FP16_UTIL = 13; // Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 + GPM_PCIE_TX_PER_SEC = 20; // PCIe traffic from this GPU in MiB/sec + GPM_PCIE_RX_PER_SEC = 21; // PCIe traffic to this GPU in MiB/sec + GPM_NVDEC_0_UTIL = 30; // Percent utilization of NVDEC 0. 0.0 - 100.0 + GPM_NVDEC_1_UTIL = 31; // Percent utilization of NVDEC 1. 0.0 - 100.0 + GPM_NVDEC_2_UTIL = 32; // Percent utilization of NVDEC 2. 0.0 - 100.0 + GPM_NVDEC_3_UTIL = 33; // Percent utilization of NVDEC 3. 0.0 - 100.0 + GPM_NVDEC_4_UTIL = 34; // Percent utilization of NVDEC 4. 0.0 - 100.0 + GPM_NVDEC_5_UTIL = 35; // Percent utilization of NVDEC 5. 0.0 - 100.0 + GPM_NVDEC_6_UTIL = 36; // Percent utilization of NVDEC 6. 0.0 - 100.0 + GPM_NVDEC_7_UTIL = 37; // Percent utilization of NVDEC 7. 0.0 - 100.0 + GPM_NVJPG_0_UTIL = 40; // Percent utilization of NVJPG 0. 0.0 - 100.0 + GPM_NVJPG_1_UTIL = 41; // Percent utilization of NVJPG 1. 0.0 - 100.0 + GPM_NVJPG_2_UTIL = 42; // Percent utilization of NVJPG 2. 0.0 - 100.0 + GPM_NVJPG_3_UTIL = 43; // Percent utilization of NVJPG 3. 0.0 - 100.0 + GPM_NVJPG_4_UTIL = 44; // Percent utilization of NVJPG 4. 0.0 - 100.0 + GPM_NVJPG_5_UTIL = 45; // Percent utilization of NVJPG 5. 0.0 - 100.0 + GPM_NVJPG_6_UTIL = 46; // Percent utilization of NVJPG 6. 0.0 - 100.0 + GPM_NVJPG_7_UTIL = 47; // Percent utilization of NVJPG 7. 0.0 - 100.0 + GPM_NVOFA_0_UTIL = 50; // Percent utilization of NVOFA 0. 0.0 - 100.0 + GPM_NVOFA_1_UTIL = 51; // Percent utilization of NVOFA 1. 
0.0 - 100.0 + GPM_NVLINK_TOTAL_RX_PER_SEC = 60; // NvLink read bandwidth for all links in MiB/sec + GPM_NVLINK_TOTAL_TX_PER_SEC = 61; // NvLink write bandwidth for all links in MiB/sec + GPM_NVLINK_L0_RX_PER_SEC = 62; // NvLink read bandwidth for link 0 in MiB/sec + GPM_NVLINK_L0_TX_PER_SEC = 63; // NvLink write bandwidth for link 0 in MiB/sec + GPM_NVLINK_L1_RX_PER_SEC = 64; // NvLink read bandwidth for link 1 in MiB/sec + GPM_NVLINK_L1_TX_PER_SEC = 65; // NvLink write bandwidth for link 1 in MiB/sec + GPM_NVLINK_L2_RX_PER_SEC = 66; // NvLink read bandwidth for link 2 in MiB/sec + GPM_NVLINK_L2_TX_PER_SEC = 67; // NvLink write bandwidth for link 2 in MiB/sec + GPM_NVLINK_L3_RX_PER_SEC = 68; // NvLink read bandwidth for link 3 in MiB/sec + GPM_NVLINK_L3_TX_PER_SEC = 69; // NvLink write bandwidth for link 3 in MiB/sec + GPM_NVLINK_L4_RX_PER_SEC = 70; // NvLink read bandwidth for link 4 in MiB/sec + GPM_NVLINK_L4_TX_PER_SEC = 71; // NvLink write bandwidth for link 4 in MiB/sec + GPM_NVLINK_L5_RX_PER_SEC = 72; // NvLink read bandwidth for link 5 in MiB/sec + GPM_NVLINK_L5_TX_PER_SEC = 73; // NvLink write bandwidth for link 5 in MiB/sec + GPM_NVLINK_L6_RX_PER_SEC = 74; // NvLink read bandwidth for link 6 in MiB/sec + GPM_NVLINK_L6_TX_PER_SEC = 75; // NvLink write bandwidth for link 6 in MiB/sec + GPM_NVLINK_L7_RX_PER_SEC = 76; // NvLink read bandwidth for link 7 in MiB/sec + GPM_NVLINK_L7_TX_PER_SEC = 77; // NvLink write bandwidth for link 7 in MiB/sec + GPM_NVLINK_L8_RX_PER_SEC = 78; // NvLink read bandwidth for link 8 in MiB/sec + GPM_NVLINK_L8_TX_PER_SEC = 79; // NvLink write bandwidth for link 8 in MiB/sec + GPM_NVLINK_L9_RX_PER_SEC = 80; // NvLink read bandwidth for link 9 in MiB/sec + GPM_NVLINK_L9_TX_PER_SEC = 81; // NvLink write bandwidth for link 9 in MiB/sec + GPM_NVLINK_L10_RX_PER_SEC = 82; // NvLink read bandwidth for link 10 in MiB/sec + GPM_NVLINK_L10_TX_PER_SEC = 83; // NvLink write bandwidth for link 10 in MiB/sec + GPM_NVLINK_L11_RX_PER_SEC = 84; // NvLink read bandwidth for link 11 in MiB/sec + GPM_NVLINK_L11_TX_PER_SEC = 85; // NvLink write bandwidth for link 11 in MiB/sec + GPM_NVLINK_L12_RX_PER_SEC = 86; // NvLink read bandwidth for link 12 in MiB/sec + GPM_NVLINK_L12_TX_PER_SEC = 87; // NvLink write bandwidth for link 12 in MiB/sec + GPM_NVLINK_L13_RX_PER_SEC = 88; // NvLink read bandwidth for link 13 in MiB/sec + GPM_NVLINK_L13_TX_PER_SEC = 89; // NvLink write bandwidth for link 13 in MiB/sec + GPM_NVLINK_L14_RX_PER_SEC = 90; // NvLink read bandwidth for link 14 in MiB/sec + GPM_NVLINK_L14_TX_PER_SEC = 91; // NvLink write bandwidth for link 14 in MiB/sec + GPM_NVLINK_L15_RX_PER_SEC = 92; // NvLink read bandwidth for link 15 in MiB/sec + GPM_NVLINK_L15_TX_PER_SEC = 93; // NvLink write bandwidth for link 15 in MiB/sec + GPM_NVLINK_L16_RX_PER_SEC = 94; // NvLink read bandwidth for link 16 in MiB/sec + GPM_NVLINK_L16_TX_PER_SEC = 95; // NvLink write bandwidth for link 16 in MiB/sec + GPM_NVLINK_L17_RX_PER_SEC = 96; // NvLink read bandwidth for link 17 in MiB/sec + GPM_NVLINK_L17_TX_PER_SEC = 97; // NvLink write bandwidth for link 17 in MiB/sec + GPM_METRIC_MAX = 98; + } + optional string dev_id = 1; + repeated NVMLProbe NVML_metrics = 2; + repeated GPMProbe GPM_metrics = 3; + optional GPUProperties properties = 4; +} \ No newline at end of file diff --git a/resource/proto/mem_metrics.proto b/resource/proto/mem_metrics.proto new file mode 100644 index 0000000..4c73636 --- /dev/null +++ b/resource/proto/mem_metrics.proto @@ -0,0 +1,163 @@ +syntax = "proto3"; + +// === Meminfo 
related metrics === +message MemBasicMetrics { + optional uint64 mem_total = 1; + optional uint64 mem_free = 2; + optional uint64 mem_available = 3; +} + +message MemKernelCacheMetrics { + optional uint64 buffers = 1; + optional uint64 cached = 2; + optional uint64 swap_cached = 3; +} + +message MemActiveInactiveMetrics { + optional uint64 active = 1; + optional uint64 inactive = 2; + optional uint64 active_anon = 3; + optional uint64 inactive_anon = 4; + optional uint64 active_file = 5; + optional uint64 inactive_file = 6; +} + +message MemNonEvictableMetrics { + optional uint64 unevictable = 1; + optional uint64 mlocked = 2; +} + +message MemSwapMetrics { + optional uint64 swap_total = 1; + optional uint64 swap_free = 2; + optional uint64 zswap_total = 3; + optional uint64 zswapped = 4; +} + +message MemDirtyWritebackMetrics { + optional uint64 dirty = 1; + optional uint64 writeback = 2; +} + +message MemTypeMetrics { + optional uint64 anon_pages = 1; + optional uint64 mapped = 2; + optional uint64 shmem = 3; +} + +message MemKernelMetrics { + optional uint64 kernel_reclaimable = 1; + optional uint64 slab = 2; + optional uint64 slab_reclaimable = 3; + optional uint64 slab_unreclaimable = 4; + optional uint64 kernel_stack = 5; + optional uint64 page_tables = 6; +} + +message MemTmpBufferMetrics { + optional uint64 nfs_unstable = 1; + optional uint64 bounce = 2; + optional uint64 writeback_tmp = 3; +} + +message MemVirtualMetrics { + optional uint64 commit_limit = 1; + optional uint64 committed_as = 2; + optional uint64 vmalloc_total = 3; + optional uint64 vmalloc_used = 4; + optional uint64 vmalloc_chunk = 5; +} + +message MemHugePageMetrics { + optional uint64 anon_huge_pages = 1; + optional uint64 shmem_huge_pages = 2; + optional uint64 shmem_pmd_mapped = 3; + optional uint64 file_huge_pages = 4; + optional uint64 file_pmd_mapped = 5; + optional uint64 hugepages_total = 6; + optional uint64 hugepages_free = 7; + optional uint64 hugepages_rsvd = 8; + optional uint64 hugepages_surp = 9; + optional uint64 hugepages_size = 10; + optional uint64 huge_tlb = 11; +} + +message MemDirectMapMetrics { + optional uint64 direct_map_4k = 1; + optional uint64 direct_map_2m = 2; + optional uint64 direct_map_4m = 3; + optional uint64 direct_map_1g = 4; +} + +message MemMiscMetrics { + optional uint64 percpu = 1; + optional uint64 hardware_corrupted = 2; +} + +// === VMem related metrics === +message VMemZoneMetrics { + optional uint64 nr_free_pages = 1; + optional uint64 nr_zone_inactive_anon = 2; + optional uint64 nr_zone_active_anon = 3; + optional uint64 nr_zone_inactive_file = 4; + optional uint64 nr_zone_active_file = 5; + optional uint64 nr_zone_unevictable = 6; + optional uint64 nr_zone_write_pending = 7; +} + +message VMemNUMAMetrics { + optional uint64 numa_hit = 1; + optional uint64 numa_miss = 2; + optional uint64 numa_foreign = 3; + optional uint64 numa_interleave = 4; + optional uint64 numa_local = 5; + optional uint64 numa_other = 6; +} + +// NOTE: should ALWAYS match the corresponding field numbers in MemInfoMetrics +message MemMetadata { + enum Probe { + INVALID = 0; // placeholder for 0 + MEM_BASIC = 1; + MEM_KERNEL_CACHE = 2; + MEM_ACTIVE_INACTIVE = 3; + MEM_NON_EVICTABLE = 4; + MEM_SWAP = 5; + MEM_DIRTY_WRITEBACK = 6; + MEM_TYPE = 7; + MEM_KERNEL = 8; + MEM_TMP_BUFFER = 9; + MEM_VIRTUAL = 10; + MEM_HUGE_PAGE = 11; + MEM_DIRECT_MAP = 12; + MEM_MISC = 13; + } + repeated Probe probes = 1; +} + +// Refer to https://man7.org/linux/man-pages/man5/proc_meminfo.5.html +// added a level of 
indirection to allow MemMetadata::Probe to be used as a field number
+message MemInfoMetrics {
+    optional MemBasicMetrics basic_metrics = 1;
+    optional MemKernelCacheMetrics kernel_cache_metrics = 2;
+    optional MemActiveInactiveMetrics active_inactive_metrics = 3;
+    optional MemNonEvictableMetrics non_evictable_metrics = 4;
+    optional MemSwapMetrics swap_metrics = 5;
+    optional MemDirtyWritebackMetrics dirty_writeback_metrics = 6;
+    optional MemTypeMetrics type_metrics = 7;
+    optional MemKernelMetrics kernel_metrics = 8;
+    optional MemTmpBufferMetrics tmp_buffer_metrics = 9;
+    optional MemVirtualMetrics virtual_metrics = 10;
+    optional MemHugePageMetrics huge_page_metrics = 11;
+    optional MemDirectMapMetrics direct_map_metrics = 12;
+    optional MemMiscMetrics misc_metrics = 13;
+}
+
+message MemMetrics {
+    optional uint64 timestamp = 1;
+    optional MemInfoMetrics meminfo_metrics = 2;
+}
+
+message MemMetricsTimeSeries {
+    repeated MemMetrics metrics = 1;
+}
diff --git a/resource/proto/proc_metrics.proto b/resource/proto/proc_metrics.proto
new file mode 100644
index 0000000..9068881
--- /dev/null
+++ b/resource/proto/proc_metrics.proto
@@ -0,0 +1,89 @@
+/* NOTE: Assumes %d and %u are 32-bit, %ld %lu, %lld, and %llu are 64-bit
+ * integers on a conventional 64-bit system. */
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_stat.5.html
+message ProcPIDStatMetrics {
+    reserved 1, 2, 4, 5, 6, 7, 8, 9, 21, 22;
+    // [NT] int32 pid = 1;
+    // [NT] string comm = 2;
+    optional uint32 state = 3;  // stored in uint32, but is actually a char
+    // [NT] int32 ppid = 4;
+    // [NT] int32 pgrp = 5;
+    // [NT] int32 session = 6;
+    // [NT] int32 tty_nr = 7;
+    // [NT] int32 tpgid = 8;
+    // [NT] uint64 flags = 9;
+    optional uint64 minflt = 10;
+    optional uint64 cminflt = 11;
+    optional uint64 majflt = 12;
+    optional uint64 cmajflt = 13;
+    optional uint64 utime = 14;
+    optional uint64 stime = 15;
+    optional int64 cutime = 16;
+    optional int64 cstime = 17;
+    optional int64 priority = 18;
+    optional int64 nice = 19;
+    optional uint64 num_threads = 20;
+    // [NT] int64 itrealvalue = 21;
+    // [NT] uint64 starttime = 22;
+    optional uint64 vsize = 23;
+}
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_statm.5.html
+// stats measured in pages
+message ProcPIDStatmMetrics {
+    optional uint64 size = 1;
+    optional uint64 resident = 2;
+    optional uint64 share = 3;
+    optional uint64 text = 4;
+    optional uint64 lib = 5;
+    optional uint64 data = 6;
+    optional uint64 dt = 7;
+}
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_io.5.html
+/* If the process is spawned by a privileged user or by some system-level
+ * service (e.g., docker), other processes may not have access to /proc/<pid>/io
+ * files, and reading them would result in "Permission denied". In this case, it
+ * is better to place the probe on the corresponding cgroup (in the docker case).
+ * Do !!!NOT!!! spawn the monitor using root privileges; running it as root is
+ * not currently tested. */
+message ProcPIDIOMetrics {
+    optional uint64 rchar = 1;                  // bytes read
+    optional uint64 wchar = 2;                  // bytes written
+    optional uint64 syscr = 3;                  // read syscalls
+    optional uint64 syscw = 4;                  // write syscalls
+    optional uint64 read_bytes = 5;             // bytes read from disk
+    optional uint64 write_bytes = 6;            // bytes written to disk
+    optional uint64 cancelled_write_bytes = 7;  // bytes written that were cancelled
+}
+
+message PerProcMetrics {
+    optional ProcPIDStatMetrics pid_stat_metrics = 1;
+    optional ProcPIDStatmMetrics pid_statm_metrics = 2;
+    optional ProcPIDIOMetrics pid_io_metrics = 3;
+}
+
+message ProcMetrics {
+    optional uint64 timestamp = 1;
+    repeated PerProcMetrics per_proc_metrics = 2;
+}
+
+message ProcMetricsTimeSeries {
+    repeated ProcMetrics metrics = 1;
+}
+
+message ProcMetadata {
+    enum Probe {
+        STAT = 0;
+        STATM = 1;
+        IO = 2;
+    };
+    optional string proc_name = 1;
+    optional uint64 pid = 2;
+    repeated Probe probes = 3;
+}
+
+message ProcMetricsMetadata {
+    repeated ProcMetadata proc_meta = 1;
+}
\ No newline at end of file
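The field names in ProcPIDIOMetrics intentionally mirror the keys of /proc/<pid>/io, so populating the generated message is a direct key-to-field copy. A sketch in Python, assuming the module name protoc generates by default (proc_metrics_pb2); the helper itself is illustrative and not part of this patch:

    # Fill ProcPIDIOMetrics from /proc/<pid>/io; field names match the file's keys.
    import proc_metrics_pb2

    IO_KEYS = ("rchar", "wchar", "syscr", "syscw",
               "read_bytes", "write_bytes", "cancelled_write_bytes")

    def read_pid_io(pid):
        metrics = proc_metrics_pb2.ProcPIDIOMetrics()
        with open(f"/proc/{pid}/io") as f:
            for line in f:
                key, _, value = line.partition(":")
                if key.strip() in IO_KEYS:
                    setattr(metrics, key.strip(), int(value))
        return metrics

    print(read_pid_io("self"))  # may raise PermissionError, as the comment above warns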
diff --git a/resource/proto/proc_metrics.proto b/resource/proto/proc_metrics.proto
new file mode 100644
index 0000000..9068881
--- /dev/null
+++ b/resource/proto/proc_metrics.proto
@@ -0,0 +1,89 @@
+/* NOTE: Assumes %d and %u are 32-bit, %ld, %lu, %lld, and %llu are 64-bit
+ * integers on a conventional 64-bit system. */
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_stat.5.html
+message ProcPIDStatMetrics {
+  reserved 1, 2, 4, 5, 6, 7, 8, 9, 21, 22;
+  // [NT] int32 pid = 1;
+  // [NT] string comm = 2;
+  optional uint32 state = 3;  // stored in uint32, but is actually a char
+  // [NT] int32 ppid = 4;
+  // [NT] int32 pgrp = 5;
+  // [NT] int32 session = 6;
+  // [NT] int32 tty_nr = 7;
+  // [NT] int32 tpgid = 8;
+  // [NT] uint64 flags = 9;
+  optional uint64 minflt = 10;
+  optional uint64 cminflt = 11;
+  optional uint64 majflt = 12;
+  optional uint64 cmajflt = 13;
+  optional uint64 utime = 14;
+  optional uint64 stime = 15;
+  optional int64 cutime = 16;
+  optional int64 cstime = 17;
+  optional int64 priority = 18;
+  optional int64 nice = 19;
+  optional uint64 num_threads = 20;
+  // [NT] int64 itrealvalue = 21;
+  // [NT] uint64 starttime = 22;
+  optional uint64 vsize = 23;
+}
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_statm.5.html
+// stats measured in pages
+message ProcPIDStatmMetrics {
+  optional uint64 size = 1;
+  optional uint64 resident = 2;
+  optional uint64 share = 3;
+  optional uint64 text = 4;
+  optional uint64 lib = 5;
+  optional uint64 data = 6;
+  optional uint64 dt = 7;
+}
+
+// Refer to https://man7.org/linux/man-pages/man5/proc_pid_io.5.html
+/* If the process is spawned by a privileged user or by some system-level
+ * service (e.g., docker), other processes may not have access to /proc/<pid>/io
+ * files, and reading it would result in "Permission denied". In this case, it
+ * is better to place the probe on the corresponding cgroup (in the docker case).
+ * Do !!!NOT!!! spawn the monitor using root privileges; it is not currently
+ * tested. */
+message ProcPIDIOMetrics {
+  optional uint64 rchar = 1;                  // bytes read
+  optional uint64 wchar = 2;                  // bytes written
+  optional uint64 syscr = 3;                  // read syscalls
+  optional uint64 syscw = 4;                  // write syscalls
+  optional uint64 read_bytes = 5;             // bytes read from disk
+  optional uint64 write_bytes = 6;            // bytes written to disk
+  optional uint64 cancelled_write_bytes = 7;  // bytes written that were cancelled
+}
+
+message PerProcMetrics {
+  optional ProcPIDStatMetrics pid_stat_metrics = 1;
+  optional ProcPIDStatmMetrics pid_statm_metrics = 2;
+  optional ProcPIDIOMetrics pid_io_metrics = 3;
+}
+
+message ProcMetrics {
+  optional uint64 timestamp = 1;
+  repeated PerProcMetrics per_proc_metrics = 2;
+}
+
+message ProcMetricsTimeSeries {
+  repeated ProcMetrics metrics = 1;
+}
+
+message ProcMetadata {
+  enum Probe {
+    STAT = 0;
+    STATM = 1;
+    IO = 2;
+  }
+  optional string proc_name = 1;
+  optional uint64 pid = 2;
+  repeated Probe probes = 3;
+}
+
+message ProcMetricsMetadata {
+  repeated ProcMetadata proc_meta = 1;
+}
\ No newline at end of file
diff --git a/resource/requirements.in b/resource/requirements.in
new file mode 100644
index 0000000..94e6c54
--- /dev/null
+++ b/resource/requirements.in
@@ -0,0 +1,37 @@
+# monitoring
+nvidia-ml-py
+pybind11-stubgen
+absl-py
+
+# LLM inference and serving
+vllm==0.8.0
+sentence_transformers
+ragas
+datasets
+
+# pytorch
+torch
+torchvision
+
+# PDF pipeline
+docling
+
+# vectorDB
+milvus==2.3.5
+pymilvus==2.3.7
+lancedb==0.24.3
+qdrant-client
+elasticsearch
+chromadb
+
+# evaluator
+FlagEmbedding
+optimum
+
+# image
+pdf2image
+colpali_engine
+qwen_vl_utils
+
+# plot
+matplotlib
diff --git a/resource/setup.sh b/resource/setup.sh
new file mode 100644
index 0000000..5b92467
--- /dev/null
+++ b/resource/setup.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -x
+
+# HuggingFace Env Var
+# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables
+# prevent autodownload
+export TRANSFORMERS_OFFLINE=1
+export HF_DATASETS_OFFLINE=1
+
+set +x
diff --git a/script/run_insert.sh b/script/run_insert.sh
new file mode 100755
index 0000000..20f0f9e
--- /dev/null
+++ b/script/run_insert.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Check if a directory was passed as an argument
+if [ -z "$1" ]; then
+    echo "Usage: $0 <config_dir>"
+    exit 1
+fi
+
+BASE_DIR="$1"
+
+# Check if the directory exists
+if [ ! -d "$BASE_DIR" ]; then
+    echo "Directory not found: $BASE_DIR"
+    exit 1
+fi
+
+find "$BASE_DIR" -type f -name "*.yaml" | sort | while read -r yaml_file; do
+    echo "Running: $yaml_file"
+
+    # Run the config synchronously and stop at the first failure
+    if ! python3 src/run.py --config "$yaml_file"; then
+        echo "Failed on: $yaml_file"
+        exit 1
+    fi
+done
\ No newline at end of file
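Note that `exit 1` above runs in the pipeline's subshell, so it ends the `while` loop rather than the shell itself; since the loop is the script's last command, the non-zero status still propagates. A Python driver with the same behavior sidesteps the subtlety entirely; a rough sketch, assuming the same `src/run.py --config` interface:

```python
# Hypothetical equivalent of script/run_insert.sh: run src/run.py once per
# YAML config under a base directory and stop at the first failure.
import pathlib
import subprocess
import sys


def run_all(base_dir: str) -> int:
    for cfg in sorted(pathlib.Path(base_dir).rglob("*.yaml")):
        print(f"Running: {cfg}")
        if subprocess.run([sys.executable, "src/run.py", "--config", str(cfg)]).returncode != 0:
            print(f"Failed on: {cfg}")
            return 1
    return 0


if __name__ == "__main__":
    sys.exit(run_all(sys.argv[1]))
```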
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..f7af39e
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,9 @@
+# Generated Product
+monitoring_sys/*.cpython*
+monitoring_sys/*.pyi
+
+proto/*
+
+*.bin
+*.yaml
+*.log
\ No newline at end of file
diff --git a/src/RAGPipeline/BaseRAGPipline.py b/src/RAGPipeline/BaseRAGPipline.py
new file mode 100644
index 0000000..24a6eb5
--- /dev/null
+++ b/src/RAGPipeline/BaseRAGPipline.py
@@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+
+
+# should make the pipeline fully modular with request queue passing
+
+# class ModularRAGPipeline(ABC):
+#     def __init__(self, **kwargs):
+#         # self.run_name = kwargs.get("run_name", "default_run")
+
+
+class BaseRAGPipeline(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def process(self, request, batch_size=1) -> None:
+        pass
diff --git a/src/RAGPipeline/ImageRAGPipline.py b/src/RAGPipeline/ImageRAGPipline.py
new file mode 100644
index 0000000..e2f9b13
--- /dev/null
+++ b/src/RAGPipeline/ImageRAGPipline.py
@@ -0,0 +1,141 @@
+from abc import ABC, abstractmethod
+import os
+import time
+import math
+from RAGPipeline.responser.TextsResponser import VLLMResponser
+from RAGPipeline.BaseRAGPipline import BaseRAGPipeline
+from encoder.sentenceTransformerEncoder import SentenceTransformerEncoder
+from RAGPipeline.retriever.BaseRetriever import BaseRetriever
+from RAGPipeline.reranker.CrossEncoderReranker import CrossEncoderReranker
+from evaluator.RagasEvaluator import RagasEvaluator
+from datasets import Dataset
+import utils.colored_print as cprint
+from utils.logger import Logger, log_time_breakdown
+from qwen_vl_utils import process_vision_info
+
+
+# should make the pipeline fully modular with request queue passing
+
+# class ModularRAGPipeline(ABC):
+#     def __init__(self, **kwargs):
+#         # self.run_name = kwargs.get("run_name", "default_run")
+
+
+class ImagesRAGPipeline(BaseRAGPipeline):
+    def __init__(
+        self,
+        retriever: BaseRetriever,
+        responser: VLLMResponser,
+        embedder: SentenceTransformerEncoder,
+        evaluator: RagasEvaluator = None,
+    ) -> None:
+
+        self.retriever = retriever
+        self.responser = responser
+        self.embedder = embedder
+        self.evaluator = evaluator
+        return
+
+    def generate_prompt(self, questions, contexts):
+        chat_template = [
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image} for image in contexts]
+                + [{"type": "text", "text": questions}],
+            }
+        ]
+        return chat_template
+
+    def process(self, request, batch_size=2) -> None:
+        if request.req_type == "query":
+            cprint.iprintf(
+                f"*** Processing {request.req_count} questions with batch size {batch_size}"
+            )
+            # load models
+            log_time_breakdown("start")
+            cprint.iprintf(f"*** Loading models")
+            self.embedder.load_encoder()
+            self.responser.load_llm()
+            cprint.iprintf(f"*** Loading models done")
+
+            nrounds = int(math.ceil(request.req_count / batch_size))
+            cprint.iprintf(f"*** Will run {nrounds} rounds")
+            for round_idx in range(0, nrounds):
+                start_sample_idx = round_idx * batch_size
+                questions, gt_answer = request.get_questions(batch_size, start_idx=start_sample_idx)
+
+                # encode questions TODO: parameter
+                # Embedding chunked texts
+                # self.embedder.load_encoder()
+                log_time_breakdown("embed")
+                embedding_start_time = time.monotonic_ns()
+                vectors = self.embedder.embedding_query(questions)
+                embedding_end_time = time.monotonic_ns()
+                # self.embedder.free_encoder()
+                cprint.iprintf(f"*** Embedding done")
+
+                for i, query in enumerate(vectors):
+                    query = query.float().numpy()
+                    # retrieval
+                    log_time_breakdown("retrieve")
+                    retrieval_start_time = time.monotonic_ns()
+                    results = self.retriever.search_db_image(query)
+                    retrieval_end_time = time.monotonic_ns()
+                    cprint.iprintf(f"*** Retrieval done")
+
+                    # augment
+                    log_time_breakdown("prompt")
+                    prompt_start_time = time.monotonic_ns()
+                    prompts = self.generate_prompt(questions[i], results)
+                    prompt_end_time = time.monotonic_ns()
+                    cprint.iprintf(f"*** Prompt generation done")
+                    with open("prompt.out", "w") as fout:
+                        for idx, prompt in enumerate(prompts):
+                            fout.write(f"=== Prompt {idx + 1} ===\n")
+                            fout.write(str(prompt) + "\n\n")
+
+                    # generation
+                    cprint.iprintf(f"*** Generating answers")
+                    # self.responser.load_llm()
+                    log_time_breakdown("generate")
+                    generation_start_time = time.monotonic_ns()
+                    responses = self.responser.query_llm(prompts)
+                    generation_end_time = time.monotonic_ns()
+                    # self.responser.free_llm()
+                    cprint.iprintf(f"*** Generation done")
+
+                    with open("response.out", "w") as fout:
+                        for idx, response in enumerate(responses):
+                            fout.write(f"=== response {idx + 1} ===\n")
+                            fout.write(response.strip() + "\n\n")
+            log_time_breakdown("free_models")
+
+            # finished
+            cprint.iprintf(f"*** Unloading models")
+            self.embedder.free_encoder()
+            self.responser.free_llm()
+            cprint.iprintf(f"*** Unloading models done")
+            log_time_breakdown("done")
+            embedding_time = embedding_end_time - embedding_start_time
+            retrieval_time = retrieval_end_time - retrieval_start_time
+            prompt_time = prompt_end_time - prompt_start_time
+            generation_time = generation_end_time - generation_start_time
+            total_time = embedding_time + retrieval_time + prompt_time + generation_time
+            print(
+                f"At round {round_idx}\n"
+                f"    embedding time: {embedding_time} ns ({embedding_time / 1e9} s, {embedding_time / total_time * 100:.2f}%)\n"
+                f"    retrieval time: {retrieval_time} ns ({retrieval_time / 1e9} s, {retrieval_time / total_time * 100:.2f}%)\n"
+                f"    prompt time: {prompt_time} ns ({prompt_time / 1e9} s, {prompt_time / total_time * 100:.2f}%)\n"
+                f"    generation time: {generation_time} ns ({generation_time / 1e9} s, {generation_time / total_time * 100:.2f}%)\n"
+            )
+            output_path = os.path.join(Logger().log_dirpath, "image_pipeline_stats.txt")
+            with open(output_path, "a") as fout:
+                fout.write(
+                    f"{round_idx}\t"
+                    f"{embedding_time}\t"
+                    f"{retrieval_time}\t"
+                    f"{prompt_time}\t"
+                    f"{generation_time}\t"
+                    f"{total_time}\n"
+                )
+        return
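Each `process()` call appends one tab-separated row of nanosecond timings to its stats file (`image_pipeline_stats.txt` here; the text pipeline below writes `text_pipeline_stats.txt` with an extra rerank column). A small loading helper; the column layout is inferred from the `fout.write` calls above, and `pandas` is an assumed extra dependency:

```python
# Sketch: parse the tab-separated per-round stats rows written by the pipelines.
import pandas as pd


def load_stats(path: str, with_rerank: bool = False) -> pd.DataFrame:
    cols = ["round", "embedding", "retrieval"]
    if with_rerank:
        cols.append("rerank")
    cols += ["prompt", "generation", "total"]
    df = pd.read_csv(path, sep="\t", header=None, names=cols)
    # Convert every timing column from nanoseconds to seconds.
    return df.assign(**{c: df[c] / 1e9 for c in cols[1:]})
```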
diff --git a/src/RAGPipeline/TextsRAGPipline.py b/src/RAGPipeline/TextsRAGPipline.py
new file mode 100644
index 0000000..82d5688
--- /dev/null
+++ b/src/RAGPipeline/TextsRAGPipline.py
@@ -0,0 +1,198 @@
+from abc import ABC, abstractmethod
+import os
+import time
+import math
+from RAGPipeline.responser.TextsResponser import VLLMResponser
+from RAGPipeline.BaseRAGPipline import BaseRAGPipeline
+from encoder.sentenceTransformerEncoder import SentenceTransformerEncoder
+from RAGPipeline.retriever.BaseRetriever import BaseRetriever
+from RAGPipeline.reranker.CrossEncoderReranker import CrossEncoderReranker
+from evaluator.RagasEvaluator import RagasEvaluator
+from datasets import Dataset
+import utils.colored_print as cprint
+from utils.logger import Logger, log_time_breakdown
+
+# should make the pipeline fully modular with request queue passing
+
+# class ModularRAGPipeline(ABC):
+#     def __init__(self, **kwargs):
+#         # self.run_name = kwargs.get("run_name", "default_run")
+
+
+class TextsRAGPipeline(BaseRAGPipeline):
+    def __init__(
+        self,
+        retriever: BaseRetriever,
+        responser: VLLMResponser,
+        embedder: SentenceTransformerEncoder,
+        reranker: CrossEncoderReranker = None,
+        evaluator: RagasEvaluator = None,
+    ) -> None:
+
+        self.retriever = retriever
+        self.reranker = reranker
+        self.responser = responser
+        self.embedder = embedder
+        self.evaluator = evaluator
+        return
+
+    def generate_prompt(self, questions, contexts):
+        context_format = """Source #{source_idx}\nDetail: {source_detail}\n"""
+        SYSTEM_PROMPT = """
+        First, check if the provided Context is relevant to the user's question.
+        Second, only if the provided Context is strongly relevant, answer the question using the Context.
+        Otherwise, if the Context is not strongly relevant, IGNORE THEM COMPLETELY and answer the question from your own knowledge. You MUST NOT say anything about relevance or missing information, or use phrases like 'the text does not discuss'.
+        There are {n_ctx} contexts in total, each in the format "{ctx_fmt}"
+        Context: {contexts_combined}
+        User's question: {question}
+        Your answer starts from here
+        """
+        prompts = []
+        for i, question in enumerate(questions):
+            prompts.append(
+                SYSTEM_PROMPT.format(
+                    n_ctx=len(contexts[i]),
+                    ctx_fmt=context_format,
+                    contexts_combined="\n".join(
+                        context_format.format(source_idx=j + 1, source_detail=ctx)
+                        for j, ctx in enumerate(contexts[i])
+                    ),
+                    question=question,
+                )
+            )
+        return prompts
+
+    def process(self, request, batch_size=2) -> None:
+        if request.req_type == "query":
+            cprint.iprintf(
+                f"*** Processing {request.req_count} questions with batch size {batch_size}"
+            )
+            log_time_breakdown("start")
+            cprint.iprintf(f"*** Loading models")
+            self.embedder.load_encoder()
+            if self.reranker is not None:
+                self.reranker.load_reranker()
+            self.responser.load_llm()
+            cprint.iprintf(f"*** Loading models done")
+
+            nrounds = int(math.ceil(request.req_count / batch_size))
+            cprint.iprintf(f"*** Will run {nrounds} rounds")
+            user_input_list = []
+            response_list = []
+            retrieved_contexts_list = []
+            reference_list = []
+            for round_idx in range(0, nrounds):
+                start_sample_idx = round_idx * batch_size
+                questions, gt_answer = request.get_questions(batch_size, start_idx=start_sample_idx)
+
+                # encode questions TODO: parameter
+                # Embedding chunked texts
+                # self.embedder.load_encoder()
+                log_time_breakdown("embed")
+                embedding_start_time = time.monotonic_ns()
+                vectors = self.embedder.embedding(questions)
+                embedding_end_time = time.monotonic_ns()
+                # self.embedder.free_encoder()
+                cprint.iprintf(f"*** Embedding done")
+
+                # retrieval
+                log_time_breakdown("retrieve")
+                retrieval_start_time = time.monotonic_ns()
+                results = self.retriever.search_db(vectors)
+                retrieval_end_time = time.monotonic_ns()
+                cprint.iprintf(f"*** Retrieval done")
+                # rerank
+                if self.reranker is not None:
+                    cprint.iprintf(
+                        f"*** Reranking top-{self.reranker.top_n} from {self.retriever.top_k} candidates"
+                    )
+                    # self.reranker.load_reranker()
+                    log_time_breakdown("rerank")
+                    rerank_start_time = time.monotonic_ns()
+                    # print(results)
+                    results = self.reranker.batch_rerank(questions, results)
+                    rerank_end_time = time.monotonic_ns()
+                    # self.reranker.free_reranker()
+                    cprint.iprintf(f"*** Reranking done")
+
+                # augment
+                log_time_breakdown("prompt")
+                prompt_start_time = time.monotonic_ns()
+                prompts = self.generate_prompt(questions, results)
+                prompt_end_time = time.monotonic_ns()
+                cprint.iprintf(f"*** Prompt generation done")
+                # with open("prompt.out", "w") as fout:
+                #     for idx, prompt in enumerate(prompts):
+                #         fout.write(f"=== Prompt {idx + 1} ===\n")
+                #         fout.write(prompt.strip() + "\n\n")
+
+                # generation
+                cprint.iprintf(f"*** Generating answers")
+                # self.responser.load_llm()
+                log_time_breakdown("generate")
+                generation_start_time = time.monotonic_ns()
+                responses = self.responser.query_llm(prompts)
+                # response = []
+                generation_end_time = time.monotonic_ns()
+                # self.responser.free_llm()
+                cprint.iprintf(f"*** Generation done")
+
+                # with open("response.out", "w") as fout:
+                #     for idx, response in enumerate(responses):
+                #         fout.write(f"=== response {idx + 1} ===\n")
+                #         fout.write(response.strip() + "\n\n")
+
+                user_input_list.extend(questions)
+                response_list.extend(responses)
+                retrieved_contexts_list.extend(results)
+                reference_list.extend(gt_answer)
+
+            evaluate_dataset = Dataset.from_dict(
+                {
+                    'user_input': user_input_list,
+                    'response': response_list,
+                    'retrieved_contexts': retrieved_contexts_list,
+                    'reference': reference_list,
+                }
+            )
+            # finished
+            log_time_breakdown("free_models")
+            cprint.iprintf(f"*** Unloading models")
+            self.embedder.free_encoder()
+            if self.reranker is not None:
+                self.reranker.free_reranker()
+            self.responser.free_llm()
+            cprint.iprintf(f"*** Unloading models done")
+            log_time_breakdown("done")
+            if self.evaluator is not None:
+                print(f"***Evaluating answers")
+                self.evaluator.evaluate_dataset(evaluate_dataset)
+
+            embedding_time = embedding_end_time - embedding_start_time
+            retrieval_time = retrieval_end_time - retrieval_start_time
+            rerank_time = rerank_end_time - rerank_start_time if self.reranker is not None else 0
+            prompt_time = prompt_end_time - prompt_start_time
+            generation_time = generation_end_time - generation_start_time
+            total_time = (
+                embedding_time + retrieval_time + rerank_time + prompt_time + generation_time
+            )
+            print(
+                f"At round {round_idx}\n"
+                f"    embedding time: {embedding_time} ns ({embedding_time / 1e9} s, {embedding_time / total_time * 100:.2f}%)\n"
+                f"    retrieval time: {retrieval_time} ns ({retrieval_time / 1e9} s, {retrieval_time / total_time * 100:.2f}%)\n"
+                f"    rerank time: {rerank_time} ns ({rerank_time / 1e9} s, {rerank_time / total_time * 100:.2f}%)\n"
+                f"    prompt time: {prompt_time} ns ({prompt_time / 1e9} s, {prompt_time / total_time * 100:.2f}%)\n"
+                f"    generation time: {generation_time} ns ({generation_time / 1e9} s, {generation_time / total_time * 100:.2f}%)\n"
+            )
+            output_path = os.path.join(Logger().log_dirpath, "text_pipeline_stats.txt")
+            with open(output_path, "a") as fout:
+                fout.write(
+                    f"{round_idx}\t"
+                    f"{embedding_time}\t"
+                    f"{retrieval_time}\t"
+                    f"{rerank_time}\t"
+                    f"{prompt_time}\t"
+                    f"{generation_time}\t"
+                    f"{total_time}\n"
+                )
        return
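For reference, a toy rendering of the prompt template above. The inputs are made up, and the pipeline's heavyweight dependencies are stubbed with `None` since `generate_prompt` never touches them:

```python
# Illustration only: generate_prompt needs no retriever/responser/embedder.
pipeline = TextsRAGPipeline(retriever=None, responser=None, embedder=None)
prompts = pipeline.generate_prompt(
    ["When was the first Super Bowl?"],
    [["The first AFL-NFL World Championship Game was played on January 15, 1967."]],
)
print(prompts[0])  # instructions, one "Source #1 / Detail: ..." context, the question
```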
diff --git a/src/RAGPipeline/__init__.py b/src/RAGPipeline/__init__.py
new file mode 100644
index 0000000..bd6586e
--- /dev/null
+++ b/src/RAGPipeline/__init__.py
@@ -0,0 +1,2 @@
+# RAGPipeline/__init__.py
+# Empty file, just makes Python treat this as a package
diff --git a/src/RAGPipeline/reranker/BaseReranker.py b/src/RAGPipeline/reranker/BaseReranker.py
new file mode 100644
index 0000000..b72c98a
--- /dev/null
+++ b/src/RAGPipeline/reranker/BaseReranker.py
@@ -0,0 +1,23 @@
+from abc import ABC, abstractmethod
+import torch
+
+
+class BaseReranker(ABC):
+    def __init__(self, device=None):
+        pass
+
+
@abstractmethod + def load_reranker(self): + pass + + @abstractmethod + def rerank(self, query, candidate_docs): + pass + + @abstractmethod + def batch_rerank(self, queries, candidate_docs_list): + pass + + @abstractmethod + def free_reranker(self): + pass diff --git a/src/RAGPipeline/reranker/CrossEncoderReranker.py b/src/RAGPipeline/reranker/CrossEncoderReranker.py new file mode 100644 index 0000000..b9e222d --- /dev/null +++ b/src/RAGPipeline/reranker/CrossEncoderReranker.py @@ -0,0 +1,58 @@ +import torch, gc +from sentence_transformers import CrossEncoder +from RAGPipeline.reranker.BaseReranker import BaseReranker +from typing import List + + +class CrossEncoderReranker(BaseReranker): + def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=5, device=None): + super().__init__() + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.model_name = model_name + self.top_n = top_n + + def load_reranker(self): + self.model = CrossEncoder(self.model_name, device=self.device) + + def rerank(self, query, candidate_docs): + pairs = [(query, doc) for doc in candidate_docs] + scores = self.model.predict(pairs) + ranked = sorted(zip(candidate_docs, scores), key=lambda x: x[1], reverse=True) + return [doc for doc, _ in ranked[: self.top_n]] + + def batch_rerank(self, queries: List[str], candidate_docs_list: List[List[str]]): + """ + queries: List[str], candidate_docs_list: List[List[str]] + returns: List[List[str]] - top-k reranked document texts per query + """ + assert len(queries) == len(candidate_docs_list), "Length mismatch" + + all_pairs = [] + index_ranges = [] + current = 0 + + for query, docs in zip(queries, candidate_docs_list): + pairs = [(query, doc) for doc in docs] + all_pairs.extend(pairs) + index_ranges.append((current, current + len(docs))) + current += len(docs) + + all_scores = self.model.predict(all_pairs, batch_size=1) + results = [] + + for (start, end), docs in zip(index_ranges, candidate_docs_list): + scores = all_scores[start:end] + ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True) + results.append([doc for doc, _ in ranked[: self.top_n]]) + + return results + + def free_reranker(self): + del self.model + torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() + try: + torch.cuda.ipc_collect() + except Exception: + pass diff --git a/src/RAGPipeline/reranker/__init__.py b/src/RAGPipeline/reranker/__init__.py new file mode 100644 index 0000000..ef38576 --- /dev/null +++ b/src/RAGPipeline/reranker/__init__.py @@ -0,0 +1,2 @@ +# reranker/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/RAGPipeline/responser/BaseResponser.py b/src/RAGPipeline/responser/BaseResponser.py new file mode 100644 index 0000000..2deacd1 --- /dev/null +++ b/src/RAGPipeline/responser/BaseResponser.py @@ -0,0 +1,21 @@ +from abc import ABC, abstractmethod + + +class BaseResponser(ABC): + def __init__(self, device=None): + pass + + def __del__(self): + pass + + @abstractmethod + def load_llm(self) -> None: + pass + + @abstractmethod + def free_llm(self) -> None: + pass + + @abstractmethod + def query_llm(self, prompts, max_tokens=500, temperature=0.7, top_p=0.9) -> list[str]: + pass diff --git a/src/RAGPipeline/responser/ImagesResponser.py b/src/RAGPipeline/responser/ImagesResponser.py new file mode 100644 index 0000000..ede394a --- /dev/null +++ b/src/RAGPipeline/responser/ImagesResponser.py @@ -0,0 +1,71 @@ +import torch, gc +from vllm import LLM, SamplingParams +from 
RAGPipeline.responser.BaseResponser import BaseResponser
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
+
+
+class ImageResponser(BaseResponser):
+    def __init__(self, model="Qwen/Qwen2-VL-7B-Instruct", device="cuda:0"):
+        self.model_name = model
+        self.device = device
+        self.vl_model = None
+        return
+
+    def load_llm(self):
+        print(f"***Loading LLM: {self.model_name} on {self.device}")
+        self.vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            self.model_name,
+            device_map=self.device,
+        )
+        self.vl_model.cuda().eval()
+        min_pixels = 224 * 224
+        max_pixels = 1024 * 1024
+        self.vl_model_processor = Qwen2VLProcessor.from_pretrained(
+            self.model_name,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            device_map=self.device,
+        )
+        print(f"***Loaded LLM: {self.model_name}")
+        return
+
+    def free_llm(self):
+        del self.vl_model
+        self.vl_model = None
+        del self.vl_model_processor
+        self.vl_model_processor = None
+        gc.collect()
+        torch.cuda.empty_cache()
+        try:
+            torch.cuda.ipc_collect()
+        except Exception:
+            pass
+
+    def query_llm(self, prompts, max_tokens=500):
+        # Prepare the inputs
+        text = self.vl_model_processor.apply_chat_template(
+            prompts, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(prompts)
+        inputs = self.vl_model_processor(
+            text=[text],
+            images=image_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        # Generate text from the vl_model
+        generated_ids = self.vl_model.generate(**inputs, max_new_tokens=max_tokens)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+
+        # Decode the generated text
+        output_text = self.vl_model_processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print("***answer:")
+        print(output_text[0])
+        return output_text
diff --git a/src/RAGPipeline/responser/TextsResponser.py b/src/RAGPipeline/responser/TextsResponser.py
new file mode 100644
index 0000000..8346254
--- /dev/null
+++ b/src/RAGPipeline/responser/TextsResponser.py
@@ -0,0 +1,54 @@
+import torch, gc
+from vllm import LLM, SamplingParams
+from RAGPipeline.responser.BaseResponser import BaseResponser
+
+
+class VLLMResponser(BaseResponser):
+    def __init__(self, model="Qwen/Qwen2.5-7B-Instruct", device="cuda:0", parallelism=1):
+        self.model_name = model
+        self.device = device
+        self.llm = None
+        self.parallelism = parallelism
+        return
+
+    def load_llm(self):
+        if self.llm is not None:
+            print(f"***LLM already loaded: {self.model_name}")
+            return
+        print(f"***Loading LLM: {self.model_name} on {self.device}")
+        self.llm = LLM(
+            model=self.model_name,
+            enforce_eager=True,
+            # device=self.device,
+            dtype=torch.bfloat16,
+            trust_remote_code=True,
+            gpu_memory_utilization=0.85,
+            max_model_len=8096,
+            tensor_parallel_size=self.parallelism,
+        )
+        print(f"***Loaded LLM: {self.model_name}")
+
+    def free_llm(self):
+        del self.llm
+        self.llm = None
+        gc.collect()
+        torch.cuda.empty_cache()
+        try:
+            torch.cuda.ipc_collect()
+        except Exception:
+            pass
+
+    def query_llm(self, prompts, max_tokens=1024, temperature=0.7, top_p=0.9):
+        # model loading/freeing is handled outside this function
+
+        sampling_params = SamplingParams(
+            max_tokens=max_tokens, temperature=temperature, top_p=top_p
+        )
+
+        # batch process
+        assert self.llm is not None, "Query called when LLM is not loaded"
+        results = self.llm.generate(prompts, sampling_params)
+        assert
len(results) == len( + prompts + ), f"Mismatch detected, generated {len(results)} responses for {len(prompts)} prompts" + return [res.outputs[0].text for res in results] diff --git a/src/RAGPipeline/responser/__init__.py b/src/RAGPipeline/responser/__init__.py new file mode 100644 index 0000000..0a7cd3b --- /dev/null +++ b/src/RAGPipeline/responser/__init__.py @@ -0,0 +1,2 @@ +# responser/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/RAGPipeline/retriever/BaseRetriever.py b/src/RAGPipeline/retriever/BaseRetriever.py new file mode 100644 index 0000000..a455ba7 --- /dev/null +++ b/src/RAGPipeline/retriever/BaseRetriever.py @@ -0,0 +1,108 @@ +import time +import os +from abc import ABC, abstractmethod +from vectordb.milvus_api import milvus_client +import concurrent.futures +import numpy as np +from PIL import Image + + +class BaseRetriever(ABC): + def __init__( + self, collection_name, top_k=5, retrieval_batch_size=1, client: milvus_client = None + ): + + # Retrieval + self.top_k = top_k + self.collection_name = collection_name + self.retrieval_batch_size = retrieval_batch_size + + # DB + self.client = client + + def search_db(self, query_embeddings): + # self.client.load_collection(self.collection_name) + + # results = [] + batch_size = self.retrieval_batch_size + results = self.client.query_search( + query_embeddings, + self.top_k, + collection_name=self.collection_name, + search_batch_size=batch_size, + multithread=True, + max_threads=1, + consistency_level="Eventually", + ) + # self._release_collections() + + return results + + def search_db_image(self, query_embeddings): + # Perform a vector search on the collection to find the top-k most similar documents. + # topk set to a reasonable large num + # results = self.db_client.query_search(embeddings, topk=50, collection_name=self.collection_name, output_fields=["vector", "seq_id", "doc_id", "filepath"]) + # search_params = {"metric_type": "IP", "params": {}} + batch_size = self.retrieval_batch_size + results = self.client.query_search_image( + query_embeddings, + int(50), + search_batch_size=batch_size, + collection_name=self.collection_name, + output_fields=["vector", "seq_id", "doc_id", "filepath"], + # search_params=search_params, + ) + + scores = [] + + def rerank_single_doc(doc_id, data, client, collection_name): + # Rerank a single document by retrieving its embeddings and calculating the similarity with the query. 
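+            # Late-interaction (MaxSim) scoring: np.dot(data, doc_vecs.T) builds a
+            # (query tokens x doc tokens) similarity matrix; .max(1) keeps the best
+            # document token per query token and .sum() aggregates into one score.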
+            doc_colbert_vecs = client.query(
+                collection_name=collection_name,
+                filter_expr=f"doc_id in ({doc_id})",
+                output_fields=["seq_id", "vector", "filepath"],
+                limit=1000,
+            )
+            if client.type == "lancedb":
+                doc_vecs = np.vstack(doc_colbert_vecs["vector"].to_list())
+                score = np.dot(data, doc_vecs.T).max(1).sum()
+                return (score, doc_id, doc_colbert_vecs["filepath"][0])
+            elif client.type == "milvus":
+                doc_vecs = np.vstack([row["vector"] for row in doc_colbert_vecs])
+                score = np.dot(data, doc_vecs.T).max(1).sum()
+                return (score, doc_id, doc_colbert_vecs[0]["filepath"])
+            else:
+                raise ValueError(f"Unsupported client type: {client.type}")
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
+            futures = {
+                executor.submit(
+                    rerank_single_doc, doc_id, query_embeddings, self.client, self.collection_name
+                ): doc_id
+                for doc_id in results
+            }
+            for future in concurrent.futures.as_completed(futures):
+                score, doc_id, filepath = future.result()
+                scores.append((score, doc_id, filepath))
+
+        scores.sort(key=lambda x: x[0], reverse=True)
+
+        def GetPDF(filepath):
+            """
+            Loads and returns the image at the given filepath.
+            """
+            if os.path.exists(filepath):
+                image = Image.open(filepath)
+                return image
+            else:
+                print(f"File does not exist: {filepath}")
+                return None
+
+        images_list = []
+        if len(scores) >= self.top_k:
+            for hits in scores[: self.top_k]:
+                images_list.append(GetPDF(hits[2]))
+        else:
+            for hits in scores:
+                images_list.append(GetPDF(hits[2]))
+        return images_list
diff --git a/src/RAGPipeline/retriever/__init__.py b/src/RAGPipeline/retriever/__init__.py
new file mode 100644
index 0000000..e5774d9
--- /dev/null
+++ b/src/RAGPipeline/retriever/__init__.py
@@ -0,0 +1,2 @@
+# retriever/__init__.py
+# Empty file, just makes Python treat this as a package
diff --git a/src/RAGRequest/BaseRAGRequest.py b/src/RAGRequest/BaseRAGRequest.py
new file mode 100644
index 0000000..4affdb1
--- /dev/null
+++ b/src/RAGRequest/BaseRAGRequest.py
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+
+
+# the class for generated workloads without workload mix
+class BaseRAGRequest(ABC):
+    def __init__(self, run_name, collection_name, req_type, dataset_name, req_count):
+        self.run_name = run_name
+        self.collection_name = collection_name
+        self.dataset_name = dataset_name
+        # self.dataprocessor = kwargs.get("dataprocessor")
+        self.req_type = req_type
+        self.req_count = req_count
+        if req_type not in ["query", "update"]:
+            raise ValueError(f"Invalid request type: {req_type}. 
Must be 'query' or 'update'.") + # if req_type == "query": + # self.query_list = kwargs.get("query_list", None) + # # self.query_format = kwargs.get("query_format", None) + # self.ground_truth = kwargs.get("ground_truth", None) + # elif req_type == "update": + # print("Update request type is not implemented yet.") + # self.req_count = kwargs.get("req_count") + + @abstractmethod + def init_requests(self): + pass + + @abstractmethod + def get_questions(self): + pass diff --git a/src/RAGRequest/TextsRAGRequest.py b/src/RAGRequest/TextsRAGRequest.py new file mode 100644 index 0000000..a7dd856 --- /dev/null +++ b/src/RAGRequest/TextsRAGRequest.py @@ -0,0 +1,40 @@ +import datasets +from RAGRequest.BaseRAGRequest import BaseRAGRequest + + +class WikipediaRequests(BaseRAGRequest): + def __init__(self, run_name, collection_name, req_type, req_count): + # Ensure dataset_name is fixed to "wikimedia/wikipedia" + dataset_name = "wikimedia/wikipedia" + self.query_list = None + super().__init__(run_name, collection_name, req_type, dataset_name, req_count) + + def init_requests(self, num): + if self.req_type == "query": + questions, gt_answers = self.get_questions(num) + query_list = {"questions": questions, "ground_truth_answers": gt_answers} + # ground_truth = data_processor.get_ground_truth(questions, gt_answers) + + def get_questions(self, batch_size, start_idx=0): + if self.req_type != "query": + raise ValueError("This request type is not supported for question retrieval.") + if self.query_list is not None: + questions = self.query_list["questions"][start_idx : start_idx + batch_size] + gt_answers = self.query_list["ground_truth_answers"][start_idx : start_idx + batch_size] + return questions, gt_answers + else: + try: + ds = datasets.load_dataset("sentence-transformers/natural-questions", split="train") + except ConnectionError as e: + if datasets.config.HF_DATASETS_OFFLINE: + print( + "***Dataset autodownload disabled and no dataset is found under " + f"HF_CACHE_HOME: <{datasets.config.HF_CACHE_HOME}>" + ) + raise e + + # Extract questions and answers + questions = ds["query"][start_idx : start_idx + batch_size] + gt_answers = ds["answer"][start_idx : start_idx + batch_size] + + return questions, gt_answers diff --git a/src/RAGRequest/__init__.py b/src/RAGRequest/__init__.py new file mode 100644 index 0000000..0e3848d --- /dev/null +++ b/src/RAGRequest/__init__.py @@ -0,0 +1,2 @@ +# RAGRequest/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..58a4f30 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,2 @@ +# src/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..28c677d --- /dev/null +++ b/src/config.py @@ -0,0 +1,156 @@ +import re +import os +import yaml + +# config should be three parts: +# 1. general config +# 2. pipeline config +# 3. 
benchmark config +deafult_runname = "default_run" + +DEFAULT_SYS_CONFIG = { + "devices": { + "cpu": "cpu", + "gpus": ["cuda:0", "cuda:1"], + "gpu_count": 2, + }, + "vector_db": { + "type": "milvus", + "db_path": "http://localhost:19530", + "db_token": "root:Milvus", + "collection_name": "", + "drop_previous_collection": False, + }, + "log": { + "metrics_log": "./log/default_run.log", + }, +} + +DEFAULT_RAG_CONFIG = { + "action": { + "preprocess": True, + "embedding": True, + "insert": False, + "build_index": False, + "retrieval": False, + "reranking": False, + "generation": False, + "evaluate": False, + }, + # ingest part + "embedding": { + "model": "nomic-ai/nomic-embed-text-v2-moe", + "batch_size": 128, + "embedding_framework": "sentence_transformers", # + "sentence_transformers_name": "all-MiniLM-L6-v2", # + }, + "insert": { + "batch_size": 512, + "drop_previous_collection": False, + "collection_name": "", + }, + "build_index": { + "index_type": "IVF_FLAT", + "metric_type": "L2", + }, + # retrieval part + "retrieval": { + "top_k": 5, + "question_num": 1, + "retrieval_batch_size": 1, + }, + "reranking": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "device": "cuda:0", + }, + # generation + "generation": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "device": "cuda:0", + }, + "evaluate": { + "evaluator_model": "ragdata/Qwen2-7B-Instruct-GPTQ-Int8", + "evaluator_embedding": "ragdata/bge-large-zh-v1.5", + }, +} + +DEFAULT_BENCHMARK_CONFIG = { + "dataset": "wikimedia/wikipedia", + "preprocessing": { + "chunktype": "length", + "chunk_size": 512, + "chunk_overlap": 0, + "dataset_ratio": 0.01, + }, +} + + +def load_config(config_path): + # check + if not os.path.exists(config_path): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + if not config_path.endswith(".yaml"): + raise ValueError("Config file should be an yaml file") + # load + print(f"load config file: {config_path}") + with open(config_path, "r") as file: + return yaml.safe_load(file) + + +def config_to_log_path(config_path="config/config.yaml") -> str: + if not config_path.endswith(".yaml"): + raise ValueError("Config path must end with .yaml") + + log_path = config_path.replace("config", "log").replace(".yaml", ".log") + return log_path + + +def get_db_collection_name( + name, + replacement_str="_", +): + # replace everything that is not a number, letter, or underscore with replacement_str + pattern = r"[^\w\d_]+" + occurrences = [(m.start(0), m.end(0)) for m in re.finditer(pattern, name)] + occurrences_sorted = sorted(occurrences, key=lambda inst: inst[0]) + + # look for continuous invalid strings + substring_sorted = [] + last_substring_start = 0 + for occ_start, occ_end in occurrences_sorted: + substring_sorted.append((last_substring_start, occ_start)) + last_substring_start = occ_end + substring_sorted.append((last_substring_start, len(name))) + + # replace them by ignoring them on concatenation + collection_name = name[substring_sorted[0][0] : substring_sorted[0][1]] + for inst in substring_sorted[1:]: + collection_name += replacement_str + name[inst[0] : inst[1]] + + return collection_name + + +def output_config(config, output_path): + """ + Save the configuration to a YAML file. 
+ """ + if not output_path.endswith(".yaml"): + raise ValueError("Output path must end with .yaml") + # if not os.path.exists(os.path.dirname(output_path)): + + with open(output_path, "w") as file: + yaml.dump(config, file, default_flow_style=False) + print(f"Configuration saved to {output_path}") + + +def generate_default_config(): + config = { + "run_name": deafult_runname, + "sys": DEFAULT_SYS_CONFIG, + "rag": DEFAULT_RAG_CONFIG, + "bench": DEFAULT_BENCHMARK_CONFIG, + } + return config + + +# output_config(generate_default_config(), "./config/example.yaml") diff --git a/src/datasetLoader/BaseDatasetLoader.py b/src/datasetLoader/BaseDatasetLoader.py new file mode 100644 index 0000000..fbf4ec3 --- /dev/null +++ b/src/datasetLoader/BaseDatasetLoader.py @@ -0,0 +1,14 @@ +from abc import ABC, abstractmethod +import pandas as pd + + +class BaseDatasetLoader(ABC): + def __init__(self, dataset_name) -> None: + self.dataset_name = dataset_name + return + + @abstractmethod + def get_dataset_slice(self, length, offset) -> pd.DataFrame: # change to offset + pass + + # TODO add a dataset free diff --git a/src/datasetLoader/PDFDatasetLoader.py b/src/datasetLoader/PDFDatasetLoader.py new file mode 100644 index 0000000..e6ee40f --- /dev/null +++ b/src/datasetLoader/PDFDatasetLoader.py @@ -0,0 +1,103 @@ +from datasetLoader.BaseDatasetLoader import BaseDatasetLoader +import pandas as pd +import datasets +import os +import requests +from tqdm import tqdm + + +# TODO add a delete method +class PDFDatasetLoader(BaseDatasetLoader): + def __init__( + self, + dataset_name="common-pile/arxiv_papers", + output_dir="/mnt/data1/yuanxu4/local_dataset/arxiv", + ): + super().__init__(dataset_name=dataset_name) + if self.dataset_name == "common-pile/arxiv_papers": + try: + ds = datasets.load_dataset(self.dataset_name) + except ConnectionError as e: + if datasets.config.HF_DATASETS_OFFLINE is True: + print( + "***Dataset autodownload disabled and no dataset is found under " + f"HF_CACHE_HOME: <{datasets.config.HF_CACHE_HOME}>" + ) + raise e + self.dataset = ds["train"] + else: + raise ValueError(f"{self.dataset_name} Dataset not support.") + self.total_length = len(ds["train"]) + self.output_dir = output_dir + + def download_pdf(self, load_num): + if self.dataset_name == "common-pile/arxiv_papers": + if load_num >= self.total_length: + load_num = self.total_length + table = self.dataset + # Directory to store PDFs + if not self.output_dir: + self.output_dir = os.path.join("local_dataset", "arxiv") + os.makedirs(self.output_dir, exist_ok=True) + + # check dir already have the datasets + if len(os.listdir(self.output_dir)) >= load_num: + print("dataset already exists") + return + pbar = tqdm(total=load_num, desc="Downloading papers") + for i, example in enumerate(table): + url = example["metadata"]["url"] + if "arxiv.org/abs/" in url: + url = url.replace("arxiv.org/abs/", "arxiv.org/pdf/") + paper_id = example["id"] + filename = f"{paper_id}.pdf" + local_path = os.path.join(self.output_dir, filename) + + if not os.path.exists(local_path): + try: + response = requests.get(url, timeout=10) + if response.status_code == 200: + with open(local_path, "wb") as f: + f.write(response.content) + load_num -= 1 + pbar.update(1) + if load_num == 0: + return + else: + print(f"Failed to download {url}: status {response.status_code}") + continue + except Exception as e: + print(f"Error downloading {url}: {e}") + continue + else: + raise ValueError(f"{self.dataset_name} Dataset not support.") + + # return a dataframe with column 
{content: local pdf path} and {metadata: url} + def get_dataset_slice(self, length, offset): + # support Arxiv dataset on huggingface + if self.dataset_name == "common-pile/arxiv_papers": + if not self.output_dir: + self.output_dir = os.path.join("local_dataset", "arxiv") + os.makedirs(self.output_dir, exist_ok=True) + + all_files = sorted([f for f in os.listdir(self.output_dir) if f.endswith(".pdf")]) + total_len = len(all_files) + start_idx = offset * length + end_idx = (offset + 1) * length + + # check slice within range + if start_idx >= total_len: + raise ValueError(f"Slice {offset} out of range. Dataset has {total_len} samples.") + + slice_files = all_files[start_idx:end_idx] + + # Directory to store PDFs + + local_paths = [os.path.join(self.output_dir, f) for f in slice_files] + + df = pd.DataFrame({"content": local_paths}) + + print(f"Loaded {len(df)} documents from index {start_idx} to {end_idx}") + return df + else: + raise ValueError(f"{self.dataset_name} Dataset not support.") diff --git a/src/datasetLoader/TextDatasetLoader.py b/src/datasetLoader/TextDatasetLoader.py new file mode 100644 index 0000000..41dd57e --- /dev/null +++ b/src/datasetLoader/TextDatasetLoader.py @@ -0,0 +1,46 @@ +from datasetLoader.BaseDatasetLoader import BaseDatasetLoader +import pandas as pd +import datasets +from datasets import load_dataset, config, Dataset + + +class TextDatasetLoader(BaseDatasetLoader): + def __init__(self, dataset_name="wikimedia/wikipedia"): + super().__init__(dataset_name=dataset_name) + # support wiki dataset on huggingface + if dataset_name == "wikimedia/wikipedia": + try: + ds = load_dataset(dataset_name, "20231101.en") + except ConnectionError as e: + if config.HF_DATASETS_OFFLINE is True: + print( + "***Dataset autodownload disabled and no dataset is found under " + f"HF_CACHE_HOME: <{config.HF_CACHE_HOME}>" + ) + raise e + self.dataset = ds["train"] + else: + raise ValueError(f"{self.dataset_name} Dataset not support.") + self.total_length = len(self.dataset) + return + + # return a dataframe with column {content: text} and {metadata: something} + def get_dataset_slice(self, length, offset): + if self.dataset_name == "wikimedia/wikipedia": + start_idx = offset * length + end_idx = (offset + 1) * length + + # check slice within range + if start_idx >= self.total_length: + raise ValueError( + f"Slice {offset} out of range. Dataset has {self.total_length} samples." 
+ ) + + table = self.dataset.select(range(start_idx, end_idx)) + + df = pd.DataFrame({"content": table["text"], "metadata": table["id"]}) + + print(f"Loaded {len(df)} documents from index {start_idx} to {end_idx}") + return df + else: + raise ValueError(f"{self.dataset_name} Dataset not support.") diff --git a/src/datasetLoader/__init__.py b/src/datasetLoader/__init__.py new file mode 100644 index 0000000..ca3c781 --- /dev/null +++ b/src/datasetLoader/__init__.py @@ -0,0 +1,2 @@ +# datasetLoader/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/datasetPreprocess/BaseDatasetPreprocess.py b/src/datasetPreprocess/BaseDatasetPreprocess.py new file mode 100644 index 0000000..834cf82 --- /dev/null +++ b/src/datasetPreprocess/BaseDatasetPreprocess.py @@ -0,0 +1,22 @@ +from langchain.text_splitter import RecursiveCharacterTextSplitter +from abc import ABC, abstractmethod +import pandas as pd + + +# TODO: make this to abstract class +class BaseDatasetPreprocess(ABC): + def __init__(self) -> None: + return + + # return a list ["text"], pass to embedding model to get vector + @abstractmethod + def chunking_text_to_text(self, df) -> list[str]: + pass + + @abstractmethod + def chunking_PDF_to_text(self, df) -> list[str]: + pass + + @abstractmethod + def chunking_PDF_to_image(self, df) -> list: + pass diff --git a/src/datasetPreprocess/PDFDatasetPreprocess.py b/src/datasetPreprocess/PDFDatasetPreprocess.py new file mode 100644 index 0000000..750e96c --- /dev/null +++ b/src/datasetPreprocess/PDFDatasetPreprocess.py @@ -0,0 +1,116 @@ +from datasetPreprocess.BaseDatasetPreprocess import BaseDatasetPreprocess +import torch +from docling_core.transforms.chunker import HierarchicalChunker +from docling.document_converter import DocumentConverter, PdfFormatOption +from tqdm import tqdm +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, +) +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.base_models import InputFormat +import os +from pdf2image import convert_from_path + + +class PDFDatasetPreprocess(BaseDatasetPreprocess): + def __init__(self): + super().__init__() + + def convert_PDF_to_text(self, df): + # using docling as document converting and chunking + # Check if GPU or MPS is available + + accelerator_options = AcceleratorOptions( + num_threads=8, + device=AcceleratorDevice.CUDA if torch.cuda.is_available() else AcceleratorDevice.CPU, + ) + pipeline_options = PdfPipelineOptions() + pipeline_options.accelerator_options = accelerator_options + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + docs = [] + for path in tqdm(df["content"], desc="convert pdf data"): + # Convert the input file to Docling Document + doc = converter.convert(path).document + docs.append(doc) + return docs + + def chunking_PDF_to_text(self, docs): + # converter = DocumentConverter() + chunker = HierarchicalChunker() + chunked_texts = [] + for doc in tqdm(docs, desc="chunking pdf data"): + # Perform hierarchical chunking + texts = [chunk.text for chunk in chunker.chunk(doc)] + chunked_texts.extend(texts) + + total_chunks_num = len(chunked_texts) + print(f"Total chunks to process: {total_chunks_num}.") + return chunked_texts + + def batch_chunking_PDF_to_text(self, df, batch_size=8): + 
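+        # Batch variant: convert_all() converts every input PDF before chunking,
+        # instead of converting one file at a time as convert_PDF_to_text() does.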
# Check if GPU or MPS is available + accelerator_options = AcceleratorOptions( + num_threads=8, + device=AcceleratorDevice.CUDA if torch.cuda.is_available() else AcceleratorDevice.CPU, + ) + pipeline_options = PdfPipelineOptions() + pipeline_options.accelerator_options = accelerator_options + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + # converter = DocumentConverter() + chunker = HierarchicalChunker() + chunked_texts = [] + input_doc_paths = [] + for path in df["content"]: + input_doc_paths.append(path) + + # Convert the input file to Docling Document + docs = converter.convert_all(input_doc_paths) + # Perform hierarchical chunking + for doc in tqdm(docs, desc="chunking pdf data"): + texts = [chunk.text for chunk in chunker.chunk(doc.document)] + chunked_texts.extend(texts) + + total_chunks_num = len(chunked_texts) + print(f"Total chunks to process: {total_chunks_num}.") + return chunked_texts + + def chunking_PDF_to_image(self, df): + saved_pages = [] + for path in tqdm(df["content"], desc=f"convert pdf to image"): + if path.lower().endswith(".pdf"): + images = convert_from_path(path) + pdf_base = os.path.splitext(os.path.basename(path))[0] + pdf_dir = os.path.dirname(path) + pages_dir = os.path.join(pdf_dir, "pages") + if not os.path.exists(pages_dir): + os.makedirs(pages_dir) + for i, image in enumerate(images): + out_path = os.path.join(pages_dir, f"{pdf_base}_page_{i+1}.png") + image.save(out_path, "PNG") + saved_pages.append(out_path) + # print(f"Saved {out_path}") + + return saved_pages + + def chunking_text_to_text(self): + return diff --git a/src/datasetPreprocess/TextDatasetPreprocess.py b/src/datasetPreprocess/TextDatasetPreprocess.py new file mode 100644 index 0000000..11ad9a7 --- /dev/null +++ b/src/datasetPreprocess/TextDatasetPreprocess.py @@ -0,0 +1,28 @@ +from datasetPreprocess.BaseDatasetPreprocess import BaseDatasetPreprocess +from langchain.text_splitter import RecursiveCharacterTextSplitter + + +class TextDatasetPreprocess(BaseDatasetPreprocess): + def __init__(self, chunk_size=512, chunk_overlap=0.1): + super().__init__() + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + + # TODO add more chunking stratgy + def chunking_text_to_text(self, df): + chunked_texts = [] + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap + ) + for text in df["content"]: + chunks = text_splitter.split_text(text) + chunked_texts.extend(chunks) + total_chunks_num = len(chunked_texts) + print(f"Total chunks to process: {total_chunks_num}.") + return chunked_texts + + def chunking_PDF_to_image(self): + return + + def chunking_PDF_to_text(self): + return diff --git a/src/datasetPreprocess/__init__.py b/src/datasetPreprocess/__init__.py new file mode 100644 index 0000000..0f49ce7 --- /dev/null +++ b/src/datasetPreprocess/__init__.py @@ -0,0 +1,2 @@ +# datasetPreprocess/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/encoder/BaseEncoder.py b/src/encoder/BaseEncoder.py new file mode 100644 index 0000000..aac7f13 --- /dev/null +++ b/src/encoder/BaseEncoder.py @@ -0,0 +1,32 @@ +from abc import ABC, abstractmethod +from sentence_transformers import SentenceTransformer +import numpy as np +import time + + +# TODO make this to abstactmethods +class 
BaseEncoder(ABC):
+    def __init__(self) -> None:
+        pass
+
+    @abstractmethod
+    def load_encoder(self) -> None:
+        pass
+
+    @abstractmethod
+    def free_encoder(self) -> None:
+        pass
+
+    @abstractmethod
+    def embedding(self, texts) -> list[np.array]:
+        pass
+
+    @abstractmethod
+    def multi_gpus_embedding(self, texts) -> list[np.array]:
+        pass
+
+    # @property
+    # def dataset_name(self):
+    #     return self.__dataset_name
+
+    # TODO add a dataset free
diff --git a/src/encoder/ColPaliEncoder.py b/src/encoder/ColPaliEncoder.py
new file mode 100644
index 0000000..7de1505
--- /dev/null
+++ b/src/encoder/ColPaliEncoder.py
@@ -0,0 +1,136 @@
+import time
+import numpy as np
+from encoder.BaseEncoder import BaseEncoder
+import torch, gc
+
+import os
+from typing import List, cast
+from PIL import Image
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+
+from colpali_engine.models import ColPali
+from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
+from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
+from colpali_engine.utils.torch_utils import ListDataset, get_torch_device
+
+
+# TODO: make these abstract methods
+class ColPaliEncoder(BaseEncoder):
+    def __init__(
+        self,
+        device,
+        model_name,
+        embedding_batch_size=64,
+    ) -> None:
+        self.device = device
+        self.model_name = model_name
+        self.embedding_batch_size = embedding_batch_size
+        self.encoder = None
+        return
+
+    def __del__(self):
+        self.free_encoder()
+
+    def load_encoder(self) -> None:
+
+        model = ColPali.from_pretrained(
+            self.model_name,
+            device_map=self.device,
+        ).eval()
+        self.dim = model.config.hidden_size
+        self.encoder = model
+        self.processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(self.model_name))
+
+        return
+
+    # TODO fix this
+    def free_encoder(self) -> None:
+        if self.encoder is not None:
+            del self.encoder
+            self.encoder = None
+        if self.processor:
+            del self.processor
+            self.processor = None
+
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+        try:
+            torch.cuda.ipc_collect()
+        except Exception:
+            pass
+        return
+
+    def embedding(self, pages):
+
+        images = [Image.open(name) for name in pages]
+
+        dataloader = DataLoader(
+            dataset=ListDataset[str](images),
+            batch_size=1,
+            shuffle=False,
+            collate_fn=lambda x: self.processor.process_images(x),
+        )
+
+        ds: List[torch.Tensor] = []
+        for batch_doc in tqdm(dataloader, "embedding pdf's images"):
+            with torch.no_grad():
+                batch_doc = {k: v.to(self.encoder.device) for k, v in batch_doc.items()}
+                embeddings_doc = self.encoder(**batch_doc)
+            ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
+
+        filepaths = [name for name in pages]
+        data = []
+        for i in range(len(filepaths)):
+            data.append(
+                {
+                    "colbert_vecs": ds[i].float().numpy(),
+                    "doc_id": i,
+                    "filepath": filepaths[i],
+                }
+            )
+
+        dict_list = []
+        for pdf in tqdm(data, "insert pdf's image"):
+            # Insert ColBERT embeddings and metadata for a document into the collection.
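+            # One inserted row per ColBERT token vector: seq_id is the token's
+            # position within the page, doc_id identifies the page, and filepath
+            # links back to the rendered page image used for reranking.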
+ colbert_vecs = [vec for vec in pdf["colbert_vecs"]] + seq_length = len(colbert_vecs) + doc_ids = [pdf["doc_id"] for i in range(seq_length)] + seq_ids = list(range(seq_length)) + dict_list.extend( + [ + { + "vector": colbert_vecs[i], + "seq_id": seq_ids[i], + "doc_id": doc_ids[i], + "filepath": pdf["filepath"], + } + for i in range(seq_length) + ] + ) + return dict_list + + def embedding_query(self, queries) -> List[torch.Tensor]: + dataloader = DataLoader( + dataset=ListDataset[str](queries), + batch_size=1, + shuffle=False, + collate_fn=lambda x: self.processor.process_queries(x), + ) + + qs: List[torch.Tensor] = [] + for batch_query in dataloader: + with torch.no_grad(): + batch_query = {k: v.to(self.encoder.device) for k, v in batch_query.items()} + embeddings_query = self.encoder(**batch_query) + qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + return qs + + # @property + # def dataset_name(self): + # return self.__dataset_name + + # TODO add a dataset free + def multi_gpus_embedding(self, texts): + pass diff --git a/src/encoder/__init__.py b/src/encoder/__init__.py new file mode 100644 index 0000000..cc6f969 --- /dev/null +++ b/src/encoder/__init__.py @@ -0,0 +1,2 @@ +# encoder/__init__.py +# Empty file, just makes Python treat this as a package diff --git a/src/encoder/sentenceTransformerEncoder.py b/src/encoder/sentenceTransformerEncoder.py new file mode 100644 index 0000000..a79e502 --- /dev/null +++ b/src/encoder/sentenceTransformerEncoder.py @@ -0,0 +1,86 @@ +import time +import numpy as np +from sentence_transformers import SentenceTransformer +from encoder.BaseEncoder import BaseEncoder +import torch, gc + + +# TODO make this to abstactmethods +class SentenceTransformerEncoder(BaseEncoder): + def __init__( + self, + device, + sentence_transformers_name, + embedding_batch_size=64, + ) -> None: + self.device = device + self.sentence_transformers_name = sentence_transformers_name + self.embedding_batch_size = embedding_batch_size + self.encoder = None + return + + def __del__(self): + self.free_encoder() + + def load_encoder(self) -> None: + self.encoder = SentenceTransformer( + self.sentence_transformers_name, + self.device, + model_kwargs={"torch_dtype": "float16"}, + ) + self.dim = self.encoder.get_sentence_embedding_dimension() + print( + f"***Loaded encoder: {self.sentence_transformers_name}\n" + f"***Embedding Dim: {self.dim}\n" + f"***Max Seq Length: {self.encoder.get_max_seq_length()}" + ) + return + + # TODO fix this + def free_encoder(self) -> None: + if self.encoder is not None: + del self.encoder + self.encoder = None + torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() + try: + torch.cuda.ipc_collect() + except Exception: + pass + return + + def embedding(self, texts) -> list[np.array]: + embeddings = self.encoder.encode( + texts, batch_size=self.embedding_batch_size, show_progress_bar=True + ) + + embeddings = np.vstack(embeddings) + embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True) + return embeddings.tolist() + + def multi_gpus_embedding(self, texts) -> list[np.array]: + embeddings_start_time = time.time() + print(f"***All dataset Embeddings start") + pool = self.encoder.start_multi_process_pool(self.device) + num_process = len(pool["processes"]) + print(f"***{num_process} processes been create, start embedding") + + embeddings = self.encoder.encode_multi_process( + texts, pool, show_progress_bar=True, batch_size=self.embedding_batch_size + ) + + embeddings = np.vstack(embeddings) + embeddings /= np.linalg.norm(embeddings, 
axis=1, keepdims=True)
+        self.encoder.stop_multi_process_pool(pool)
+        embeddings_end_time = time.time()
+        print(f"***Embeddings shape: {embeddings.shape}")
+        print(f"***All dataset embeddings done, time: {embeddings_end_time - embeddings_start_time}")
+
+        return embeddings
+
+    # @property
+    # def dataset_name(self):
+    #     return self.__dataset_name
+
+    # TODO add a dataset free
diff --git a/src/evaluator/BaseEvaluator.py b/src/evaluator/BaseEvaluator.py
new file mode 100644
index 0000000..e881206
--- /dev/null
+++ b/src/evaluator/BaseEvaluator.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+
+class BaseEvaluator(ABC):
+    def __init__(self, dataset_name: str = None) -> None:
+        self.__dataset_name = dataset_name
+
+    @abstractmethod
+    def evaluate_single(
+        self, question: str, answer: str, contexts: list[str], ground_truth: str
+    ) -> None:
+        pass
+
+    @abstractmethod
+    def evaluate_dataset(self, dataset) -> None:
+        pass
diff --git a/src/evaluator/README.md b/src/evaluator/README.md
new file mode 100644
index 0000000..0d65b26
--- /dev/null
+++ b/src/evaluator/README.md
@@ -0,0 +1,32 @@
+### Evaluator
+
+Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentation is in the [MonitoringSystem README](monitoring_sys/README.md).
+
+This module provides evaluation tools for Retrieval-Augmented Generation (RAG) using **local language models**. It wraps local LLMs (e.g., Qwen2-7B-Instruct-GPTQ-Int8) to work with [RAGAS](https://github.com/explodinggradients/ragas) and enables metrics like `context_recall`, `faithfulness`, `answer_relevancy`, and `context_precision`.
+
+#### Install Git LFS
+
+> Required to download large model weights.
+
+**Ubuntu/Debian:**
+```bash
+sudo apt install git-lfs
+```
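Beyond the config-driven entry point shown in the Usage section below, the evaluator can also be driven directly from Python; a sketch under assumed local model paths and a prepared HF `Dataset` named `dataset`:

```python
# Hypothetical direct usage of src/evaluator/RagasEvaluator.py.
from evaluator.RagasEvaluator import RagasEvaluator

evaluator = RagasEvaluator(
    llm_path="./Qwen2-7B-Instruct-GPTQ-Int8",  # local LLM weights (assumed path)
    emb_path="./bge-large-zh-v1.5",            # local embedding weights (assumed path)
)
evaluator.evaluate_dataset(dataset)  # columns: question/answer/contexts/ground_truth
```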
+
+#### Usage
+
+> Download the LLM and embedding models (e.g. Qwen2-7B-Instruct-GPTQ-Int8, bge-large-zh-v1.5)
+```
+git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct-GPTQ-Int8
+cd Qwen2-7B-Instruct-GPTQ-Int8
+git lfs pull
+cd ..
+git clone https://huggingface.co/BAAI/bge-large-zh-v1.5
+cd bge-large-zh-v1.5
+git lfs pull
+cd ..
+```
+> Set the LLM and embedding model paths in the config file `wiki_evaluate.yaml`, then run the RAG system
+```
+python3 src/run.py --config config/wiki_evaluate.yaml
+```
\ No newline at end of file
diff --git a/src/evaluator/RagasEvaluator.py b/src/evaluator/RagasEvaluator.py
new file mode 100644
index 0000000..678471a
--- /dev/null
+++ b/src/evaluator/RagasEvaluator.py
@@ -0,0 +1,162 @@
+from typing import List, Optional, Any
+from datasets import Dataset
+from ragas.metrics import faithfulness, context_recall, context_precision, answer_relevancy
+from ragas import evaluate
+from ragas.llms import LangchainLLMWrapper
+from ragas.embeddings import BaseRagasEmbeddings
+from ragas.run_config import RunConfig
+from FlagEmbedding import FlagModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+from langchain.llms.base import LLM
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+import asyncio
+from evaluator.BaseEvaluator import BaseEvaluator
+
+
+class MyLLM(LLM):
+    tokenizer: AutoTokenizer = None
+    model: AutoModelForCausalLM = None
+
+    def __init__(self, model_name_or_path: str):
+        super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto")
+        self.model.generation_config = GenerationConfig.from_pretrained(model_name_or_path)
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        messages = [{"role": "user", "content": prompt}]
+        input_ids = self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        model_inputs = self.tokenizer([input_ids], return_tensors="pt").to('cuda')
+        generated_ids = self.model.generate(model_inputs.input_ids, max_new_tokens=4096)
+        generated_ids = [
+            output_ids[len(input_ids) :]
+            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
+
+    @property
+    def _llm_type(self):
+        return "local_llm"
+
+
+class MyEmbedding(BaseRagasEmbeddings):
+
+    def __init__(self, path, run_config, max_length=512, batch_size=256):
+        self.model = FlagModel(
+            path,
+            query_instruction_for_retrieval="Generate a representation for this sentence to retrieve related articles: ",
+        )
+        self.max_length = max_length
+        self.batch_size = batch_size
+        self.run_config = run_config
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        return await asyncio.to_thread(self.embed_documents, texts)
+
+    async def aembed_query(self, text: str) -> List[float]:
+        return await asyncio.to_thread(self.embed_query, text)
+
+    def close(self):
+        try:
+            self.model.stop_self_pool()
+        except Exception as e:
+            print(f"Warning during embedder cleanup: {e}")
+
+
+# run_config = RunConfig(timeout=800, max_wait=800)
+class
RagasEvaluator(BaseEvaluator): + def __init__(self, llm_path, emb_path): + self.run_config = RunConfig(timeout=800, max_wait=800) + self.embedding_model = MyEmbedding(emb_path, self.run_config) + self.my_llm = LangchainLLMWrapper(MyLLM(llm_path), self.run_config) + + def evaluate_single(self, question: str, answer: str, contexts: List[str], ground_truth: str): + + dataset = Dataset.from_dict( + { + 'question': [question], + 'answer': [answer], + 'contexts': [contexts], + 'ground_truth': [ground_truth], + } + ) + print( + { + 'question': [question], + 'answer': [answer], + 'contexts': [contexts], + 'ground_truth': [ground_truth], + } + ) + + result = evaluate( + dataset, + metrics=[context_recall, context_precision, answer_relevancy, faithfulness], + llm=self.my_llm, + embeddings=self.embedding_model, + run_config=self.run_config, + ) + + df = result.to_pandas() + print(df.head()) + df.to_csv("evaluate_result.csv", index=False) + loop = asyncio.get_event_loop() + loop.close() + return + + def evaluate_dataset(self, dataset): + result = evaluate( + dataset, + metrics=[context_recall, context_precision, answer_relevancy, faithfulness], + llm=self.my_llm, + embeddings=self.embedding_model, + run_config=self.run_config, + ) + print("*" * 50) + print(result) + print("*" * 50) + df = result.to_pandas() + print(df.head()) + df.to_csv("evaluate_result.csv", index=False) + loop = asyncio.get_event_loop() + loop.close() + return + + +# data_samples = { +# 'question': [ +# 'When was the first Super Bowl?', +# 'Who won the most Super Bowls?' +# ], +# 'answer': [ +# 'The first Super Bowl was held on Jan 15, 1967', +# 'The most Super Bowls have been won by The New England Patriots' +# ], +# 'contexts': [ +# [ +# 'The first AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California.'], +# [ +# 'The New England Patriots have won the Super Bowl a record six times, surpassing the Pittsburgh Steelers who have won it six times as well.'] +# ], +# 'ground_truth': [ +# 'The first Super Bowl was held on January 15, 1967', +# 'The New England Patriots have won the Super Bowl a record six times' +# ] +# } + +# dataset = Dataset.from_dict(data_samples) diff --git a/src/evaluator/RagasOpenAI.py b/src/evaluator/RagasOpenAI.py new file mode 100644 index 0000000..84b188c --- /dev/null +++ b/src/evaluator/RagasOpenAI.py @@ -0,0 +1,108 @@ +from typing import List +from datasets import Dataset +from ragas.metrics import faithfulness, context_recall, context_precision, answer_relevancy +from ragas import evaluate +from ragas.llms import LangchainLLMWrapper +from ragas.run_config import RunConfig +import asyncio +from evaluator.BaseEvaluator import BaseEvaluator + +from langchain_openai import ChatOpenAI +from ragas.embeddings import OpenAIEmbeddings +import openai +from ragas import EvaluationDataset +import os + +os.environ["OPENAI_API_KEY"] = "your_openai_api_key_here" # Replace with your actual OpenAI API key + + +# run_config = RunConfig(timeout=800, max_wait=800) +class RagasOpenAI(BaseEvaluator): + def __init__(self, llm_path, emb_path): + self.llm = ChatOpenAI(model="gpt-4o") + self.openai_client = openai.OpenAI() + self.embeddings = OpenAIEmbeddings(client=self.openai_client) + self.run_config = RunConfig( + timeout=120, # Adjust the timeout as needed + max_retries=15, # Increase the number of retries + max_wait=90, # Adjust the maximum wait time + log_tenacity=True, # Enable logging for retry attempts + ) + + def 
evaluate_single(self, question: str, answer: str, contexts: List[str], ground_truth: str):
+
+        dataset = Dataset.from_dict(
+            {
+                'question': [question],
+                'answer': [answer],
+                'contexts': [contexts],
+                'ground_truth': [ground_truth],
+            }
+        )
+        print(
+            {
+                'question': [question],
+                'answer': [answer],
+                'contexts': [contexts],
+                'ground_truth': [ground_truth],
+            }
+        )
+
+        result = evaluate(
+            dataset,
+            metrics=[context_recall, context_precision, answer_relevancy, faithfulness],
+            llm=self.llm,
+            embeddings=self.embeddings,
+            run_config=self.run_config,
+        )
+
+        df = result.to_pandas()
+        print(df.head())
+        df.to_csv("evaluate_result.csv", index=False)
+        loop = asyncio.get_event_loop()
+        loop.close()
+        return
+
+    def evaluate_dataset(self, dataset):
+        evaluation_dataset = EvaluationDataset.from_list(dataset)
+        # evaluator_llm = LangchainLLMWrapper(self.llm)
+        result = evaluate(
+            evaluation_dataset,
+            metrics=[context_recall, context_precision, answer_relevancy, faithfulness],
+            llm=self.llm,
+            embeddings=self.embeddings,
+            run_config=self.run_config,
+        )
+        print("*" * 50)
+        print(result)
+        print("*" * 50)
+        df = result.to_pandas()
+        print(df.head())
+        df.to_csv("evaluate_result.csv", index=False)
+        loop = asyncio.get_event_loop()
+        loop.close()
+        return
+
+
+# data_samples = {
+#     'question': [
+#         'When was the first Super Bowl?',
+#         'Who won the most Super Bowls?'
+#     ],
+#     'answer': [
+#         'The first Super Bowl was held on Jan 15, 1967',
+#         'The most Super Bowls have been won by The New England Patriots'
+#     ],
+#     'contexts': [
+#         [
+#             'The first AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California.'],
+#         [
+#             'The New England Patriots have won the Super Bowl a record six times, surpassing the Pittsburgh Steelers who have won it six times as well.']
+#     ],
+#     'ground_truth': [
+#         'The first Super Bowl was held on January 15, 1967',
+#         'The New England Patriots have won the Super Bowl a record six times'
+#     ]
+# }
+
+# dataset = Dataset.from_dict(data_samples)
diff --git a/src/evaluator/Ragasvllm.py b/src/evaluator/Ragasvllm.py
new file mode 100644
index 0000000..d2f8ad9
--- /dev/null
+++ b/src/evaluator/Ragasvllm.py
@@ -0,0 +1,262 @@
+import os
+
+os.environ["OUTLINES_CACHE_DIR"] = "vllm_cache"
+os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+
+import typing as t
+import torch, gc
+
+from ragas.metrics import LLMContextPrecisionWithReference, LLMContextRecall
+from ragas import evaluate, EvaluationDataset
+from vllm import AsyncLLMEngine, LLM, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+
+
+import uuid
+
+from langchain_core.callbacks import Callbacks
+from langchain_core.outputs import LLMResult, Generation
+from langchain_core.prompt_values import PromptValue
+from ragas.cache import CacheInterface
+from ragas.llms import BaseRagasLLM
+from ragas.run_config import RunConfig
+
+from evaluator.BaseEvaluator import BaseEvaluator
+from datasets import Dataset
+from typing import List, Optional, Any
+from ragas.metrics import (
+    LLMContextRecall,
+    Faithfulness,
+    FactualCorrectness,
+    AnswerAccuracy,
+    BleuScore,
+)
+import asyncio
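The key contract in what follows is that RAGAS consumes LangChain-style `LLMResult` objects. A standalone sketch of the shape `vLLMWrapper` has to produce, with a hard-coded string standing in for a real vLLM call:

```python
from langchain_core.outputs import LLMResult, Generation

# One input prompt with n candidate outputs -> generations is a list holding a
# single inner list of n Generation objects, each tagged with a finish_reason.
result = LLMResult(
    generations=[
        [Generation(text="Paris.", generation_info={"finish_reason": "stop"})]
    ]
)
assert len(result.generations) == 1     # one prompt
assert len(result.generations[0]) == 1  # n == 1 candidate
```

+
+
+class vLLMWrapper(BaseRagasLLM):
+    """
+    A wrapper class that adapts vLLM's inference engine to the Ragas-compatible BaseRagasLLM interface. 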
+ + This class enables using vLLM for scoring and evaluation tasks within the Ragas framework by implementing + the `generate_text` and `agenerate_text` method that produces LangChain-compatible `LLMResult` objects. + Source: https://github.com/explodinggradients/ragas/blob/main/ragas/src/ragas/llms/base.py#L123 + + Attributes: + llm: The vLLM model instance, typically created via `vllm.LLM(...)`. + sampling_params: A `SamplingParams` object defining temperature, top_p, etc. + run_config: Optional configuration for controlling how evaluations are executed. + cache: Optional cache for storing/reusing model outputs. + + """ + + def __init__( + self, + vllm_model, + sampling_params, + run_config: t.Optional[RunConfig] = None, + cache: t.Optional[CacheInterface] = None, + ): + super().__init__(cache=cache) + self.llm = vllm_model + self.sampling_params = sampling_params + + if run_config is None: # legacy code + run_config = RunConfig() + self.set_run_config(run_config) + + def is_finished(self, response: LLMResult) -> bool: + """ + Verify that generation finished correctly by looking at finish_reason. + `response` contains the n outputs of a single input, thus: + len(response.generations) == 1 + len(response.generations[0]) == n + """ + is_finished_list = [] + for single_generation in response.generations[0]: + # generation_info is provided with `finish_reason` + finish_reason = single_generation.generation_info.get("finish_reason") + is_finished_list.append(finish_reason == 'stop') + + # if all the n outputs finished correctly, return True + return all(is_finished_list) + + def generate_text( + self, + prompt: PromptValue, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = None, + ) -> LLMResult: + temperature = None + stop = None + callbacks = None + + prompt = prompt.to_string() + self.sampling_params.n = n + + vllm_result = self.llm.generate(prompt, self.sampling_params)[0] + + generations = [ + [ + Generation( + text=output.text.strip(), + generation_info={'finish_reason': output.finish_reason}, + ) + for output in vllm_result.outputs + ] + ] + ragas_expected_result = LLMResult(generations=generations) + + return ragas_expected_result + + async def agenerate_text( + self, + prompt: PromptValue, + n: int = 1, + temperature: t.Optional[float] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = None, + ) -> LLMResult: + temperature = None + stop = None + callbacks = None + + prompt = prompt.to_string() + self.sampling_params.n = n + request_id = str(uuid.uuid4()) + results_generator = self.llm.generate(prompt, self.sampling_params, request_id=request_id) + vllm_result = None + async for request_output in results_generator: + vllm_result = request_output + generations = [ + [ + Generation( + text=output.text.strip(), + generation_info={'finish_reason': output.finish_reason}, + ) + for output in vllm_result.outputs + ] + ] + ragas_expected_result = LLMResult(generations=generations) + + return ragas_expected_result + + def set_run_config(self, run_config: RunConfig): + self.run_config = run_config + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(llm={self.llm.__class__.__name__}(...))" + + +# run_config = RunConfig(timeout=800, max_wait=800) +class Ragasvllm(BaseEvaluator): + def __init__(self, llm_path="Qwen/Qwen2.5-7B-Instruct"): + self.run_config = RunConfig(timeout=800, max_wait=800) + # self.embedding_model = MyEmbedding(emb_path, self.run_config) + self.llm_name = llm_path + + 
self.sampling_params = SamplingParams(
+            temperature=0.6,
+            top_p=0.9,
+            max_tokens=8096,
+        )
+
+    def load_evaluator_model(self):
+        # TODO: set device from config
+        self.llm: AsyncLLMEngine = AsyncLLMEngine.from_engine_args(
+            AsyncEngineArgs(
+                model=self.llm_name,
+                task='generate',  # generation task
+                enforce_eager=True,
+                # device=self.device,
+                dtype=torch.bfloat16,
+                trust_remote_code=True,
+                gpu_memory_utilization=0.8,
+                max_model_len=8192,
+                tensor_parallel_size=2,
+            )
+        )
+
+    async def free_evaluator_model(self):
+        if self.llm is not None:
+            # self.llm.shutdown_background_loop()
+            del self.llm
+            self.llm = None
+        try:
+            import torch.distributed as dist
+
+            if dist.is_available() and dist.is_initialized():
+                dist.destroy_process_group()
+        except Exception:
+            pass
+
+        gc.collect()
+        torch.cuda.empty_cache()
+        try:
+            torch.cuda.ipc_collect()
+        except Exception:
+            pass
+
+    def evaluate_single(self, question: str, answer: str, contexts: List[str], ground_truth: str):
+
+        dataset = Dataset.from_dict(
+            {
+                'question': [question],
+                'answer': [answer],
+                'contexts': [contexts],
+                'ground_truth': [ground_truth],
+            }
+        )
+        print(
+            {
+                'question': [question],
+                'answer': [answer],
+                'contexts': [contexts],
+                'ground_truth': [ground_truth],
+            }
+        )
+
+        result = evaluate(
+            dataset,
+            # metrics=[FactualCorrectness(), AnswerAccuracy(), BleuScore()],
+            metrics=[FactualCorrectness(), AnswerAccuracy(), LLMContextRecall()],
+            llm=vLLMWrapper(self.llm, self.sampling_params),
+            # embeddings=self.embedding_model,
+            run_config=self.run_config,
+        )
+
+        df = result.to_pandas()
+        print(df.head())
+        df.to_csv("evaluate_result.csv", index=False)
+        asyncio.get_event_loop().close()
+        return
+
+    def evaluate_dataset(self, dataset):
+
+        self.load_evaluator_model()
+        try:
+            result = evaluate(
+                dataset,
+                metrics=[FactualCorrectness(), AnswerAccuracy(), LLMContextRecall()],
+                llm=vLLMWrapper(self.llm, self.sampling_params),
+                # embeddings=self.embedding_model,
+                run_config=self.run_config,
+            )
+            print("*" * 50)
+            print(result)
+            print("*" * 50)
+            df = result.to_pandas()
+            print(df.head())
+            df.to_csv("evaluate_result.csv", index=False)
+
+        finally:
+            # IMPORTANT: the async engine must be freed inside a running event
+            # loop, so delegate to an async helper and drive it with asyncio.run.
+            asyncio.run(self._async_free())
+        return
+
+    async def _async_free(self):
+        # async cleanup: shuts down the engine and releases GPU memory
+        await self.free_evaluator_model()
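Putting the pieces together, a minimal driver for the vLLM-backed evaluator could look like the sketch below. The record list is toy data; the field names follow the `EvaluationDataset` conventions of recent ragas releases and may need adjusting for your version:

```python
from ragas import EvaluationDataset
from evaluator.Ragasvllm import Ragasvllm

records = [
    {
        "user_input": "When was the first Super Bowl?",
        "response": "The first Super Bowl was held on Jan 15, 1967.",
        "retrieved_contexts": [
            "The first AFL-NFL World Championship Game was played on January 15, 1967."
        ],
        "reference": "The first Super Bowl was held on January 15, 1967.",
    }
]

evaluator = Ragasvllm(llm_path="Qwen/Qwen2.5-7B-Instruct")
# evaluate_dataset loads the vLLM engine, runs the metrics, writes
# evaluate_result.csv, and frees the engine afterwards.
evaluator.evaluate_dataset(EvaluationDataset.from_list(records))
```

diff --git a/src/monitoring_sys/__init__.py b/src/monitoring_sys/__init__.py
new file mode 100644
index 0000000..5709d2f
--- /dev/null
+++ b/src/monitoring_sys/__init__.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import sys
+from unittest import mock
+
+# NOTE: Support for generating help for flags is useful, and
+# absl.app.define_help_flags provides that functionality. But in the process of
+# importing absl.app, it will also try to import absl.logging. Importing
+# absl.logging is not desired as it does more than its c++
+# counterpart (e.g., it defines extra flags that are not desired). Also, this
+# library uses a customized logging scheme based on the python logging module,
+# so absl.logging should not be imported. 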
We void the absl.logging module before +# importing the absl.app.define_help_flags to avoid undesired behavior. +with mock.patch.dict("sys.modules", {"absl.logging": mock.Mock()}): + from absl import flags as abflags + from absl.app import define_help_flags + +from utils.logger import logging, Logger + +comp_logger = Logger().register_component("MSys", auto_readable=False) + +# real imports +import io +import re +import json +import types +import typing +import utils.colored_print as cprint +import monitoring_sys.libmsys as lms +from utils.python_utils import SupportsReadStr + +# fuse c++ side interface into this module +from monitoring_sys.libmsys import * + +from monitoring_sys.config_parser.msys_config_parser import MSysConfig + + +class MSys: + @staticmethod + def from_config_dict(config_dict: dict) -> MSys: + return MSys(MSysConfig.from_config_dict(config_dict)) + + @staticmethod + def from_yaml_file(fp: SupportsReadStr) -> MSys: + return MSys(MSysConfig.from_yaml_file(fp)) + + @staticmethod + def from_yaml_string(yaml_string: str) -> MSys: + return MSys(MSysConfig.from_yaml_file(io.StringIO(yaml_string))) + + @staticmethod + def from_msys_config(msys_config: MSysConfig) -> MSys: + return MSys(msys_config) + + def __init__(self, msys_config: MSysConfig): + self.__msys_config = msys_config + self.__msys_id = lms.getMonitoringSystem(**msys_config.init_config) + self.__msys_add_meter_functions = self.msys_add_monitor_functions + + for meter_property in msys_config.meter_configs: + assert isinstance(meter_property, dict) + meter_type = meter_property.pop("type", None) + assert meter_type is not None, "Each meter must have a 'type' field" + + add_meter_func = self.__msys_add_meter_functions.get(meter_type, None) + assert add_meter_func is not None, ( + f"Unknown meter type: {meter_type}, " + f"available meters: {list(self.__msys_add_meter_functions.keys())}" + ) + ret = add_meter_func(self.__msys_id, **meter_property) + assert ( + ret + ), f"Failed to add meter: {meter_type} with properties: {json.dumps(meter_property)}" + + def test_run(self) -> bool: + return lms.testRun(self.__msys_id) + + def report_status(self, verbose: bool = False, detail: bool = False) -> None: + lms.reportStatus(self.__msys_id, verbose, detail) + + def start_recording(self) -> bool: + return lms.startRecording(self.__msys_id) + + def stop_recording(self) -> bool: + return lms.stopRecording(self.__msys_id) + + def __enter__(self): + assert self.start_recording(), "[MSys] Failed to start recording" + return self + + def __exit__(self, exctype, value, tb): + ret = self.stop_recording() + if not ret: + comp_logger.log(logging.ERROR, "[MSys] Failed to stop recording") + if exctype is not None: + comp_logger.log(logging.ERROR, f"Exception occurred in MSys monitor region") + return False + return True + + @property + def msys_add_monitor_functions(self) -> dict[str, typing.Callable]: + monitor_func_pattern = r"^add(.*)ToSystem$" + return { + regex_match.group(1): getattr(lms, func_name) + for func_name in dir(lms) + if ( + isinstance(getattr(lms, func_name), types.BuiltinFunctionType) + and (regex_match := re.match(monitor_func_pattern, func_name)) + ) + } + + +# === code to run after importing module === +# call absl helper to show all defined flags for all the imported modules +define_help_flags() +abflags.FLAGS.unparse_flags() + +# filter argv and parse only known flags via absl +known_flags = set(abflags.FLAGS) +filtered_argv = [sys.argv[0]] # keep program name +for arg in sys.argv[1:]: + if arg.startswith("--"): + 
key = arg.split("=")[0][2:] # remove '--' + if key in known_flags: + filtered_argv.append(arg) + else: + filtered_argv.append(arg) +abflags.FLAGS(filtered_argv) + +# initialize the monitoring system properly +assert initialize(Logger().log_dirpath), "[MSys] Initialization failed" diff --git a/src/monitoring_sys/config_parser/msys_config_parser.py b/src/monitoring_sys/config_parser/msys_config_parser.py new file mode 100644 index 0000000..4603e2c --- /dev/null +++ b/src/monitoring_sys/config_parser/msys_config_parser.py @@ -0,0 +1,252 @@ +from __future__ import annotations +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__, name_level=0) + +import io +import re +import os +import yaml +import json +import copy +from typing import TYPE_CHECKING, Any, Callable, Final, Sequence, Type, TypeVar +from utils.python_utils import SupportsReadStr + + +class MSysConfig: + @classmethod + def from_config_dict(cls, config_dict: dict[str, Any]) -> MSysConfig: + return cls(config_dict) + + @classmethod + def from_yaml_file(cls, fp: SupportsReadStr) -> MSysConfig: + config_dict = yaml.safe_load(fp) + assert isinstance(config_dict, dict), "Invalid YAML format, expect input to be a dict" + return cls(config_dict) + + @classmethod + def from_yaml_string(cls, yaml_string: str) -> MSysConfig: + config_dict = yaml.safe_load(yaml_string) + assert isinstance(config_dict, dict), "Invalid YAML format, expect input to be a dict" + return cls(config_dict) + + def __init__(self, config_dict: dict[str, Any]): + msys_config = config_dict.get("MSys", None) + assert msys_config is not None, "Config file must have 'MSys' field" + self.__config: dict[str, Any] = copy.deepcopy(msys_config) + + msys_init_config = self.__config.get("system", None) + assert msys_init_config is not None, "Config file must have 'MSys.system' field" + self.__init_config = msys_init_config + + msys_meter_configs = self.__config.get("meter", None) + assert msys_meter_configs is not None, "Config file must have 'MSys.meter' field" + self.__meter_configs = msys_meter_configs + + for meter_config in self.__meter_configs: + assert isinstance(meter_config, dict), "Each meter config must be a dict" + + @property + def init_config(self) -> dict[str, Any]: + return copy.deepcopy(self.__init_config) + + @property + def meter_configs(self) -> list[dict[str, Any]]: + return copy.deepcopy(self.__meter_configs) + + def add_init_config(self, new_configs: dict[str, Any]) -> MSysConfig: + self.__init_config.update(new_configs) + return self + + +class StaticEnv: + def __init__(self): + self.__static_env: dict[str, str | list[str]] = {} + + def add_env(self, env: dict[str, str | list[str]] | list[dict[str, str | list[str]]]) -> None: + if isinstance(env, dict): + self.__static_env.update(env) + elif isinstance(env, list): + for item in env: + if isinstance(item, dict): + self.__static_env.update(item) + + def get_env(self, env_name: str) -> str | list[str] | None: + return self.__static_env.get(env_name, None) + + def disp_env(self) -> None: + print(json.dumps(self.__static_env, indent=2)) + + __envs: dict[str, StaticEnv] = {} + + @classmethod + def get_static_env(cls, env_name: str) -> StaticEnv: + if env_name not in cls.__envs: + cls.__envs[env_name] = StaticEnv() + return cls.__envs[env_name] + + +class MacroTranslator: + def __init__(self, env: StaticEnv): + self.__env = env + self.__pattern = r"\${{\s*([^\s]+)\s*}}" + self.__list_pattern = r"^(\s*)\${{\s*-\s*([^\s]+).*$" + + T = TypeVar("T", str, SupportsReadStr) + + 
def translate(self, text: T) -> T: + def replace_macro(match: re.Match) -> str: + macro_name = match.group(1) + replacement = self.__env.get_env(macro_name) + if replacement is None: + comp_logger.log( + logging.WARNING, + f"String macro {match.group(0).strip()} is not defined in environment", + ) + return match.group(0) + if not isinstance(replacement, str): + comp_logger.log( + logging.WARNING, + f"String macro {match.group(0).strip()} expansion expects a string " + f"but found a {type(replacement).__name__}", + ) + return match.group(0) + return replacement + + def replace_list_macro(match: re.Match) -> str: + prefix = match.group(1) + macro_name = match.group(2) + replacement = self.__env.get_env(macro_name) + if replacement is None: + comp_logger.log( + logging.WARNING, + f"List macro {match.group(0).strip()} is not defined in environment", + ) + return match.group(0) + if not isinstance(replacement, list): + comp_logger.log( + logging.WARNING, + f"List macro {match.group(0).strip()} expansion expects a list " + f"but found a {type(replacement).__name__}", + ) + return match.group(0) + return "\n".join(f"{prefix}- {item}" for item in replacement) + + if isinstance(text, str): + return re.sub( + self.__pattern, + replace_macro, + re.sub(self.__list_pattern, replace_list_macro, text, flags=re.MULTILINE), + ) + elif isinstance(text, SupportsReadStr): + return io.StringIO( + re.sub( + self.__pattern, + replace_macro, + re.sub( + self.__list_pattern, replace_list_macro, text.read(), flags=re.MULTILINE + ), + ) + ) + else: + assert False, "Unsupported type for translation" + + +# === Initialize Global StaticEnv === +# TODO: Change these to be more user friendly in the future +global_env = StaticEnv.get_static_env("global") +# get all fields in MemMetadata.Probe and put them into StaticEnv +from proto.mem_metrics_pb2 import MemMetadata + +global_env.add_env( + { + f"mem_mon.probe.{name}": str(val.number) + for name, val in MemMetadata.Probe.DESCRIPTOR.values_by_name.items() + } +) + +from proto.gpu_metrics_pb2 import GPUMetadata + +# get all fields in GPUMetadata.[NVML|GPM]Probe and put them into StaticEnv +global_env.add_env( + [ + { + f"gpu_mon.probe.{name}": str(val.number) + for name, val in GPUMetadata.NVMLProbe.DESCRIPTOR.values_by_name.items() + }, + { + f"gpu_mon.probe.{name}": str(val.number) + for name, val in GPUMetadata.GPMProbe.DESCRIPTOR.values_by_name.items() + }, + ] +) + +from proto.proc_metrics_pb2 import ProcMetadata + +# get all fields in ProcMetadata.Probe and put them into StaticEnv +global_env.add_env( + { + f"proc_mon.probe.{name}": str(val.number) + for name, val in ProcMetadata.Probe.DESCRIPTOR.values_by_name.items() + } +) + + +import pynvml + +pynvml.nvmlInit() +global_env.add_env( + { + f"gpus.all_gpus": [str(idx) for idx in range(pynvml.nvmlDeviceGetCount())], + } +) + +from monitoring_sys.config_parser.resource_identifier.this_process import ThisProcess +import utils.python_utils as pyutils + +src_dev = pyutils.find_device_for_path(os.path.abspath(__file__)) +src_dev2 = pyutils.find_device_for_path(os.path.abspath("/mnt/data1")) +print(f"Source file device: {src_dev}, /mnt/data1 device: {src_dev2}") +assert src_dev is not None, "Cannot find device for current source file" +global_env.add_env( + { + "this_process.pids": [str(pid) for pid in ThisProcess().get_process_pids()], + "this_process.used_disks": [src_dev, src_dev2], + } +) + +from monitoring_sys.config_parser.resource_identifier.vdb_base import DockerComposeClient +from 
monitoring_sys.config_parser.resource_identifier.vdb_milvus import MilvusDockerCompose + +# Vector DB +vdbs_pids: list[str] = [] +vdbs_used_disks: list[str] = [] +try: + # milvus + milvus_config_path = DockerComposeClient.query_active_docker_compose_config_path( + "milvus-standalone" + ) + milvus_docker_compose = MilvusDockerCompose(milvus_config_path) + + # processes + milvus_pids = [str(pid) for pid in milvus_docker_compose.get_process_pids()] + vdbs_pids.extend(milvus_pids) + global_env.add_env({"vdb.milvus.pids": milvus_pids}) + + # used disks + assert milvus_config_path is not None + milvus_used_disk = pyutils.find_device_for_path(milvus_config_path) + assert milvus_used_disk is not None + milvus_used_disks = [milvus_used_disk] + global_env.add_env({"vdb.milvus.used_disks": milvus_used_disks}) + vdbs_used_disks.extend(milvus_used_disks) +except Exception: + pass +global_env.add_env({"vdbs.pids": vdbs_pids}) +global_env.add_env({"vdbs.used_disks": vdbs_used_disks}) + +global_env.add_env( + { + "pylogger.log_dirpath": Logger().log_dirpath, + } +) diff --git a/src/monitoring_sys/config_parser/resource_identifier/base.py b/src/monitoring_sys/config_parser/resource_identifier/base.py new file mode 100644 index 0000000..ee9e792 --- /dev/null +++ b/src/monitoring_sys/config_parser/resource_identifier/base.py @@ -0,0 +1,24 @@ +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__, name_level=1) + +import abc +import utils.decorator as deco +from typing import Any + + +class MonitoredProc(abc.ABC): + @abc.abstractmethod + def __init__(self): + pass + + @abc.abstractmethod + def get_process_with_desc(self) -> dict[int, dict[str, Any]]: + pass + + @abc.abstractmethod + def get_process_pids(self) -> set[int]: + pass + + def pids(self) -> set[int]: + return self.get_process_pids() diff --git a/src/monitoring_sys/config_parser/resource_identifier/this_process.py b/src/monitoring_sys/config_parser/resource_identifier/this_process.py new file mode 100644 index 0000000..2b275df --- /dev/null +++ b/src/monitoring_sys/config_parser/resource_identifier/this_process.py @@ -0,0 +1,19 @@ +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__, name_level=1) + +import os + +from monitoring_sys.config_parser.resource_identifier import base + + +class ThisProcess(base.MonitoredProc): + def __init__(self, description: str = "ThisProcess") -> None: + self.__pid = os.getpid() + self.__description = description + + def get_process_with_desc(self) -> dict[int, dict[str, str]]: + return {self.__pid: {"Desc": self.__description}} + + def get_process_pids(self) -> set[int]: + return {self.__pid} diff --git a/src/monitoring_sys/config_parser/resource_identifier/vdb_base.py b/src/monitoring_sys/config_parser/resource_identifier/vdb_base.py new file mode 100644 index 0000000..fe6ca45 --- /dev/null +++ b/src/monitoring_sys/config_parser/resource_identifier/vdb_base.py @@ -0,0 +1,95 @@ +from __future__ import annotations +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__, name_level=1) + +import os +import abc +import subprocess +import textwrap +import json +from typing import Any + +from monitoring_sys.config_parser.resource_identifier import base + + +class VDBMonitoredProc(base.MonitoredProc): + pass + + +class DockerComposeClient: + def __init__(self, config_path) -> None: + assert os.path.exists(config_path), comp_logger.get_augmented_message( + f"Path {config_path} does not exist" + ) + if 
os.path.isdir(config_path): + config_path = os.path.join(config_path, "docker-compose.yml") + assert os.path.isfile(config_path), comp_logger.get_augmented_message( + f"Docker compose config does not exist in {config_path}" + ) + self.__docker_compose_config_path = os.path.abspath(config_path) + + @staticmethod + def required_fields(): + return {"Service", "ID", "PID"} + + @staticmethod + def query_active_docker_compose_config_path(name: str) -> str | None: + p = subprocess.Popen( + [ + "docker", + "inspect", + "--format", + r"""'{{ index .Config.Labels "com.docker.compose.project.config_files" }}'""", + name, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + pret_stdout, pret_stderr = [msg.decode().strip() for msg in p.communicate()] + docker_compose_config_path = pret_stdout.strip("'") + + if p.returncode != 0: + comp_logger.log( + logging.WARNING, + f"Docker inspect on container with name {name} failed " + f"with retcode {p.returncode} and message :\n" + f"{textwrap.indent(pret_stderr, ' ')}", + ) + return None + return os.path.dirname(docker_compose_config_path) + + def get_service_descs(self): + p = subprocess.Popen( + [ + "docker", + "compose", + "-f", + self.__docker_compose_config_path, + "ps", + "--format=json", + "--status=running", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + pret_stdout, pret_stderr = [msg.decode().strip() for msg in p.communicate()] + + assert p.returncode == 0, comp_logger.get_augmented_message( + f"Docker compose failed in with retcode {p.returncode} and message :\n" + + textwrap.indent(pret_stderr, ' ') + ) + + container_descs = {} + for desc in (json.loads(chunk) for chunk in pret_stdout.split('\n')): + p = subprocess.Popen( + ["docker", "inspect", "-f", r"{{.State.Pid}}", desc["ID"]], stdout=subprocess.PIPE + ) + pid_str, _ = p.communicate() + assert p.returncode == 0 + desc["PID"] = int(pid_str.strip()) + + container_descs[desc["ID"]] = { + field: desc[field] for field in DockerComposeClient.required_fields() + } + return container_descs diff --git a/src/monitoring_sys/config_parser/resource_identifier/vdb_milvus.py b/src/monitoring_sys/config_parser/resource_identifier/vdb_milvus.py new file mode 100644 index 0000000..14efaa1 --- /dev/null +++ b/src/monitoring_sys/config_parser/resource_identifier/vdb_milvus.py @@ -0,0 +1,40 @@ +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__, name_level=1) + +from monitoring_sys.config_parser.resource_identifier import vdb_base + + +class MilvusDockerCompose(vdb_base.VDBMonitoredProc): + @staticmethod + def required_services(): + return {"etcd", "minio", "standalone"} + + def __init__(self, docker_compose_path): + super().__init__() + + self.__docker_compose_inst = vdb_base.DockerComposeClient(docker_compose_path) + self.__service_descs = self.__docker_compose_inst.get_service_descs() + service_counts = { + service_name: 0 for service_name in MilvusDockerCompose.required_services() + } + + for _, service_info in self.__service_descs.items(): + service_name = service_info["Service"] + if service_name in service_counts: + service_counts[service_name] += 1 + + invalid_services = { + service_name: service_count + for service_name, service_count in service_counts.items() + if service_count != 1 + } + assert len(invalid_services) == 0, comp_logger.get_augmented_message( + f"Milvus docker compose find invalid services {invalid_services}, service count should be one" + ) + + def get_process_with_desc(self): + return {desc["PID"]: desc["Service"] for 
desc in self.__service_descs.values()}
+
+    def get_process_pids(self) -> set[int]:
+        return {desc["PID"] for desc in self.__service_descs.values()}
diff --git a/src/monitoring_sys/config_parser/resource_identifier/vdbs.py b/src/monitoring_sys/config_parser/resource_identifier/vdbs.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/multimodal/PDFPipeline.py b/src/multimodal/PDFPipeline.py
new file mode 100644
index 0000000..9e75c4e
--- /dev/null
+++ b/src/multimodal/PDFPipeline.py
@@ -0,0 +1,372 @@
+from pdf2image import convert_from_path
+import matplotlib.pyplot as plt
+
+from colpali_engine.models import ColPali
+from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
+from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor
+from colpali_engine.utils.torch_utils import ListDataset, get_torch_device
+from src.vectordb.milvus_api import milvus_client
+from PIL import Image
+from torch.utils.data import DataLoader
+import torch
+from typing import List, cast
+import os
+from tqdm import tqdm
+import concurrent.futures
+import numpy as np
+import re
+from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
+from qwen_vl_utils import process_vision_info
+
+
+def display_pdf_images(images_list):
+    """Display all images in the provided list as subplots with 5 images per row."""
+    num_images = len(images_list)
+    num_rows = num_images // 5 + (1 if num_images % 5 > 0 else 0)
+    fig, axes = plt.subplots(num_rows, 5, figsize=(20, 4 * num_rows))
+    axes = axes.flatten()
+    for i, img in enumerate(images_list):
+        if i < len(axes):
+            ax = axes[i]
+            ax.imshow(img)
+            ax.set_title(f"Page {i+1}")
+            ax.axis('off')
+    for j in range(num_images, len(axes)):
+        axes[j].axis('off')
+    plt.tight_layout()
+    plt.show()
+
+
+class PDFDatasetProcessor:
+    def __init__(self, encoder_model_name="vidore/colpali-v1.2", collection_name="PDF_1"):
+        self.encoder_model_name = encoder_model_name
+        self.collection_name = collection_name
+        self.pages_dir = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._setup_encoder()
+        self._setup_db()
+
+    def _setup_encoder(self):
+        """Initialize ColPali encoder."""
+        device = get_torch_device("cuda")
+
+        model = ColPali.from_pretrained(
+            self.encoder_model_name,
+            dtype=torch.bfloat16,
+            device_map=device,
+        ).eval()
+
+        self.encoder = model
+        self.processor = cast(
+            ColPaliProcessor, ColPaliProcessor.from_pretrained(self.encoder_model_name)
+        )
+
+    def _setup_db(self):
+        # set db
+        self.db_client = milvus_client(
+            db_path="http://localhost:19530",
+            db_token="root:Milvus",
+            collection_name=self.collection_name,
+            drop_previous_collection=True,
+            # dim=config["sys"]["vector_db"]["dim"],
+            index_type="HNSW",
+            metric_type="IP",
+        )
+
+        self.db_client.setup()
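For orientation before the conversion and insert methods: ColPali produces one embedding per image patch, so a single PDF page maps to many Milvus rows sharing a `doc_id`. A toy sketch of the row layout the insert path builds (shapes and the file path are illustrative; real vectors come from the encoder):

```python
import numpy as np

# One page -> many patch-level vectors (dimensions here are placeholders).
page_embedding = np.random.rand(1030, 128).astype(np.float32)

rows = [
    {
        "vector": page_embedding[i],  # one patch-level vector
        "seq_id": i,                  # position of the patch within the page
        "doc_id": 0,                  # all rows of a page share one doc_id
        "filepath": "pages/paper_page_1.png",
    }
    for i in range(len(page_embedding))
]
print(len(rows))  # one Milvus row per patch vector
```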
+    # input: pdf path
+    # output: none
+    # save png files into the pages/ dir
+    def PDFtoimage(self, pdf_path):
+        """
+        Convert all PDF files in pdf_path to images.
+        Saves each page as '{pdf_path}/pages/{pdf_filename}_page_{i+1}.png'.
+        """
+        self.pages_dir = os.path.join(pdf_path, "pages/")
+
+        if os.path.exists(self.pages_dir):
+            print(f"'{self.pages_dir}' directory already exists. Skipping PDF to image conversion.")
+            return
+
+        os.makedirs(self.pages_dir, exist_ok=True)
+
+        for filename in os.listdir(pdf_path):
+            if filename.lower().endswith(".pdf"):
+                path = os.path.join(pdf_path, filename)
+                images = convert_from_path(path)
+
+                pdf_base = os.path.splitext(filename)[0]
+                for i, image in enumerate(images):
+                    out_path = os.path.join(self.pages_dir, f"{pdf_base}_page_{i+1}.png")
+                    image.save(out_path, "PNG")
+                    print(f"Saved {out_path}")
+
+    def PDFembedding(self):
+        """Generate embeddings for each page in PDF.
+
+        Returns:
+            List[dict] with keys "colbert_vecs", "doc_id", "filepath"
+        """
+
+        if self.pages_dir is None:
+            print("self.pages_dir has not been created yet. Run PDFtoimage(pdf_path) first.")
+            return
+
+        images = [Image.open(self.pages_dir + name) for name in os.listdir(self.pages_dir)]
+
+        dataloader = DataLoader(
+            dataset=ListDataset[str](images),
+            batch_size=1,
+            shuffle=False,
+            collate_fn=lambda x: self.processor.process_images(x),
+        )
+
+        ds: List[torch.Tensor] = []
+        for batch_doc in tqdm(dataloader, "embedding pdf's images"):
+            with torch.no_grad():
+                batch_doc = {k: v.to(self.encoder.device) for k, v in batch_doc.items()}
+                embeddings_doc = self.encoder(**batch_doc)
+            ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
+
+        filepaths = [self.pages_dir + name for name in os.listdir(self.pages_dir)]
+        data = []
+        for i in range(len(filepaths)):
+            data.append(
+                {
+                    "colbert_vecs": ds[i].float().numpy(),
+                    "doc_id": i,
+                    "filepath": filepaths[i],
+                }
+            )
+        return data
+
+    def PDFinsert(self, data):
+        dict_list = []
+        for pdf in tqdm(data, "insert pdf's image"):
+            # Insert ColBERT embeddings and metadata for a document into the collection.
+            colbert_vecs = [vec for vec in pdf["colbert_vecs"]]
+            seq_length = len(colbert_vecs)
+            doc_ids = [pdf["doc_id"] for i in range(seq_length)]
+            seq_ids = list(range(seq_length))
+            dict_list.extend(
+                [
+                    {
+                        "vector": colbert_vecs[i],
+                        "seq_id": seq_ids[i],
+                        "doc_id": doc_ids[i],
+                        "filepath": pdf["filepath"],
+                    }
+                    for i in range(seq_length)
+                ]
+            )
+        self.db_client.insert_data(
+            dict_list, collection_name=self.collection_name, create_collection=True
+        )
+
+
+class PDFRagPipeline:
+    def __init__(self, pdf_dir, encoder_model_name="vidore/colpali-v1.2", collection_name="PDF_1"):
+        self.encoder_model_name = encoder_model_name
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.collection_name = collection_name
+        self.pages_dir = os.path.join(pdf_dir, "pages/")
+        self._setup_db()
+        self._setup_encoder()
+
+    def _setup_db(self):
+        # set db
+        self.db_client = milvus_client(
+            db_path="http://localhost:19530",
+            db_token="root:Milvus",
+            collection_name=self.collection_name,
+            drop_previous_collection=True,
+            # dim=config["sys"]["vector_db"]["dim"],
+            index_type="HNSW",
+            metric_type="IP",
+        )
+
+        self.db_client.setup()
+
+    def _setup_encoder(self):
+        """Initialize ColPali encoder."""
+        device = get_torch_device("cuda")
+
+        model = ColPali.from_pretrained(
+            self.encoder_model_name,
+            dtype=torch.bfloat16,
+            device_map=device,
+        ).eval()
+
+        self.encoder = model
+        self.processor = cast(
+            ColPaliProcessor, ColPaliProcessor.from_pretrained(self.encoder_model_name)
+        )
+
+    def _setup_llm(self):
+        self.vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct",
+            dtype=torch.bfloat16,
+        )
+        self.vl_model.cuda().eval()
+        min_pixels = 224 * 224
+        max_pixels = 1024 * 1024
+        self.vl_model_processor = Qwen2VLProcessor.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct", 
min_pixels=min_pixels, max_pixels=max_pixels + ) + return + + def QueriesEmbedding(self, queries): + """Generate embeddings for queries. + + Returns: + List[vector] + """ + dataloader = DataLoader( + dataset=ListDataset[str](queries), + batch_size=1, + shuffle=False, + collate_fn=lambda x: self.processor.process_queries(x), + ) + + qs: List[torch.Tensor] = [] + for batch_query in dataloader: + with torch.no_grad(): + batch_query = {k: v.to(self.encoder.device) for k, v in batch_query.items()} + embeddings_query = self.encoder(**batch_query) + qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + print(qs[0]) + return qs + + def PDFsearch(self, embeddings, top_k=1): + # Perform a vector search on the collection to find the top-k most similar documents. + # topk set to a reasonable large num + # results = self.db_client.query_search(embeddings, topk=50, collection_name=self.collection_name, output_fields=["vector", "seq_id", "doc_id", "filepath"]) + # search_params = {"metric_type": "IP", "params": {}} + results = self.db_client.search( + collection_name=self.collection_name, + data=embeddings, + limit=int(50), + output_fields=["vector", "seq_id", "doc_id", "filepath"], + # search_params=search_params, + ) + + print(f"len results: {len(results)}") + # get unique doc_id from db search + doc_ids = set() + for r_id in range(len(results)): + for r in range(len(results[r_id])): + doc_ids.add(results[r_id][r]["entity"]["doc_id"]) + + scores = [] + + def rerank_single_doc(doc_id, data, client, collection_name): + # Rerank a single document by retrieving its embeddings and calculating the similarity with the query. + doc_colbert_vecs = client.db_query( + collection_name=collection_name, + filter=f"doc_id in [{doc_id}]", + output_fields=["seq_id", "vector", "filepath"], + limit=1000, + ) + doc_vecs = np.vstack( + [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))] + ) + score = np.dot(data, doc_vecs.T).max(1).sum() + return (score, doc_id, doc_colbert_vecs[0]["filepath"]) + + with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor: + futures = { + executor.submit( + rerank_single_doc, doc_id, embeddings, self.db_client, self.collection_name + ): doc_id + for doc_id in doc_ids + } + for future in concurrent.futures.as_completed(futures): + score, doc_id, filepath = future.result() + scores.append((score, doc_id, filepath)) + + scores.sort(key=lambda x: x[0], reverse=True) + if len(scores) >= top_k: + return scores[:top_k] + else: + return scores + + def GetPDF(self, filepath): + """ + Loads and returns the image at the given filepath. 
+        """
+        if os.path.exists(filepath):
+            image = Image.open(filepath)
+            return image
+        else:
+            print(f"File does not exist: {filepath}")
+            return None
+
+    def PDFquery(self, query, top_k=1, max_new_tokens=500):
+        """High-level interface for querying PDF pages."""
+
+        # load llm model
+        self._setup_llm()
+
+        # query search
+        results = self.PDFsearch(query, top_k)
+        images_list = []
+        for hits in results:
+            images_list.append(self.GetPDF(hits[2]))
+        chat_template = [
+            {
+                "role": "user",
+                "content": [{"type": "image", "image": image} for image in images_list]
+                + [{"type": "text", "text": query}],
+            }
+        ]
+
+        # Prepare the inputs
+        text = self.vl_model_processor.apply_chat_template(
+            chat_template, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(chat_template)
+        inputs = self.vl_model_processor(
+            text=[text],
+            images=image_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        # Generate text from the vl_model
+        generated_ids = self.vl_model.generate(**inputs, max_new_tokens=max_new_tokens)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+
+        # Decode the generated text
+        output_text = self.vl_model_processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print("***answer:")
+        print(output_text[0])
+        return output_text
+
+
+def main_test():
+
+    # INSERT
+    # DatasetProcessor = PDFDatasetProcessor(collection_name="PDF_2")
+    # DatasetProcessor.PDFtoimage("./pdf")
+    # embedding = DatasetProcessor.PDFembedding()
+    # DatasetProcessor.PDFinsert(embedding)
+
+    # QUERY
+    RagPipeline = PDFRagPipeline("/home/yuanxu4/RAGPipeline/pdf/", collection_name="PDF_2")
+
+    queries = ["How to do end-to-end retrieval with ColBERT?"]
+    eb = RagPipeline.QueriesEmbedding(queries)
+    for query in eb:
+        query = query.float().numpy()
+        RagPipeline.PDFquery(query)
+    return
+
+
+if __name__ == "__main__":
+    main_test()
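The reranking step in `PDFsearch` above relies on ColBERT-style late interaction: for every query token vector, take the best-matching document vector (maximum inner product), then sum over query tokens. A self-contained numpy illustration of that `np.dot(data, doc_vecs.T).max(1).sum()` score, with random data and illustrative shapes:

```python
import numpy as np

rng = np.random.default_rng(0)
query_vecs = rng.random((20, 128))  # 20 query-token embeddings
doc_vecs = rng.random((1030, 128))  # patch embeddings of one stored page

# Similarity of every query token against every document patch...
sim = query_vecs @ doc_vecs.T       # shape (20, 1030)
# ...keep the best patch per query token, then aggregate over query tokens.
score = sim.max(axis=1).sum()
print(score)
```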
diff --git a/src/multimodal/pdf_parse.py b/src/multimodal/pdf_parse.py
new file mode 100644
index 0000000..0f81b37
--- /dev/null
+++ b/src/multimodal/pdf_parse.py
@@ -0,0 +1,109 @@
+from pypdf import PdfReader
+from structured_parser import general_parser
+from tqdm import tqdm
+import os
+import sys
+import time
+import copy
+import numpy as np
+
+
+PDF_PARSER_CONFIG_DEFAULT = {
+    "text_chunk_size": 512,
+    "chunk_overlap": 0.1,
+    "figure_extraction": True,
+    "figure_caption": False,
+    "table_extraction": False,
+    "table_caption": False,
+}
+
+
+class rag_pdf_parser(general_parser):
+    def __init__(self, pdf_dir_path=None, config=None):
+        self.type = "PDF"
+        self.pdf_dir_path = pdf_dir_path
+        self.config = config if config else PDF_PARSER_CONFIG_DEFAULT.copy()
+        self.fig_store = None
+        self.figures = []
+        self.text_chunks = []
+
+    def print_config(self):
+        print(f"PDF Parser Config: {self.config}")
+
+    def set_config(self, entry, value):
+        if entry in self.config:
+            self.config[entry] = value
+            print(f"Config entry {entry} set to {value}.")
+        else:
+            print(
+                f"Invalid config entry: {entry}. Available entries are: {list(self.config.keys())}"
+            )
+        return
+
+    def set_fig_store(self, path):
+        # use the existing directory, or create a new one
+        if os.path.exists(path):
+            if os.path.isdir(path):
+                self.fig_store = path
+            else:
+                print(f"Path {path} is not a directory.")
+        else:
+            os.makedirs(path)
+            self.fig_store = path
+        print(f"Figure store set to: {self.fig_store}")
+
+    def _parse_pdf(self, pdf_path):
+        reader = PdfReader(pdf_path)
+        for page_number, page in tqdm(
+            enumerate(reader.pages), desc=f"Processing pages in {pdf_path}"
+        ):
+            # Extract text chunks
+            text = page.extract_text()
+            if text:
+                # Split text into chunks
+                # text_chunks = self._split_text_into_chunks(text)
+                # self.text_chunks.extend(text_chunks)
+                print(f"Page {page_number + 1} Text:\n{text}\n")
+
+            # Extract figures (images) if needed
+            if self.config["figure_extraction"]:
+                for image_index, image in enumerate(page.images):
+                    image_path = os.path.join(
+                        "./", f"figure_page{page_number + 1}_{image_index}.jpg"
+                    )
+                    with open(image_path, "wb") as img_file:
+                        img_file.write(image.data)
+                    print(f"Saved figure from page {page_number + 1} to {image_path}")
+
+    def parse(self, pdf_name=None):
+        if pdf_name is None:
+            # parse all pdfs in directory
+            for pdf_file in os.listdir(self.pdf_dir_path):
+                if pdf_file.endswith(".pdf"):
+                    pdf_path = os.path.join(self.pdf_dir_path, pdf_file)
+                    self._parse_pdf(pdf_path)
+
+    def get_text_chunks(self):
+        return self.text_chunks
+
+    def get_figures(self):
+        return self.figures
+
+
+if __name__ == "__main__":
+    testpdf_path = "/home/shaobol2/Documents/flatflash.pdf"
+
+    reader = PdfReader(testpdf_path)
+    for page_number, page in tqdm(enumerate(reader.pages), desc="Processing pages"):
+        # Extract text chunks
+        text = page.extract_text()
+        if text:
+            print(f"Page {page_number + 1} Text:\n{text}\n")
+        for image_index, image in enumerate(page.images):
+            image_path = os.path.join("./", f"figure_page{page_number + 1}_{image_index}.jpg")
+            with open(image_path, "wb") as img_file:
+                img_file.write(image.data)
+            print(f"Saved figure from page {page_number + 1} to {image_path}")
+    # Extract figures (images) if needed
+    # This part is left as a placeholder for future implementation
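A quick way to exercise the parser above is the short sketch below. The directory path is a placeholder, and note that in its current form the class prints page text and writes extracted figures to the working directory rather than accumulating chunks:

```python
from pdf_parse import rag_pdf_parser

parser = rag_pdf_parser("./pdf")  # directory containing .pdf files
parser.print_config()
parser.set_config("figure_extraction", False)  # text-only pass
parser.parse()  # walks every PDF in the directory
print(len(parser.get_text_chunks()))
```

diff --git a/src/multimodal/structured_parser.py b/src/multimodal/structured_parser.py
new file mode 100644
index 0000000..75eb497
--- /dev/null
+++ b/src/multimodal/structured_parser.py
@@ -0,0 +1,13 @@
+class general_parser:
+    parser_types = ["Default", "PDF", "HTML", "JSON", "CSV"]
+
+    def __init__(self):
+        self.type = "Default"
+
+    def set_type(self, type):
+        if type in self.parser_types:
+            self.type = type
+        else:
+            raise ValueError(
+                f"Invalid parser type: {type}. 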
Available types are: {self.parser_types}" + ) diff --git a/src/rag_utils/config.py b/src/rag_utils/config.py new file mode 100644 index 0000000..6d5ecd2 --- /dev/null +++ b/src/rag_utils/config.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import enum +import os +import abc +import utils.decorator as deco + +import re +import torch +from sentence_transformers import SentenceTransformer + + +class RAGProperties(metaclass=deco.Singleton): + class Type(enum.IntEnum): + MODEL_NAME = enum.auto() + CHUNK_SIZE = enum.auto() + CHUNK_OVERLAP = enum.auto() + + def __init__(self, **kwargs): + # default values + self.__properties = { + RAGProperties.Type.MODEL_NAME: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + RAGProperties.Type.CHUNK_SIZE: (chunk_size := 256), + RAGProperties.Type.CHUNK_OVERLAP: round(chunk_size * 0.10), + } + print(self.__properties) + # merge with input properties + self.__properties |= kwargs + + def get(self, type): + return self.__properties[type] + + +class Encoder: + def __new__(cls, *args, **kwargs): + raise TypeError("Cannot be instantiated") + + class Type(enum.IntEnum): + TEXT = enum.auto() + N_TYPES = enum.auto() + + @classmethod + def get(cls, type, **kwargs): + if type == Encoder.Type.TEXT: + return SentenceTransformer(RAGProperties().get(RAGProperties.Type.MODEL_NAME), **kwargs) + + +def get_db_collection_name( + name, + replacement_str="_", +): + # replace everything that is not a number, letter, or underscore with replacement_str + pattern = r"[^\w\d_]+" + occurrences = [(m.start(0), m.end(0)) for m in re.finditer(pattern, name)] + occurrences_sorted = sorted(occurrences, key=lambda inst: inst[0]) + + # look for continuous invalid strings + substring_sorted = [] + last_substring_start = 0 + for occ_start, occ_end in occurrences_sorted: + substring_sorted.append((last_substring_start, occ_start)) + last_substring_start = occ_end + substring_sorted.append((last_substring_start, len(name))) + + # replace them by ignoring them on concatenation + collection_name = name[substring_sorted[0][0] : substring_sorted[0][1]] + for inst in substring_sorted[1:]: + collection_name += replacement_str + name[inst[0] : inst[1]] + + return collection_name + + +# class VectorDB: +# def __init__(self, path: str, embedding_dim: int, **db_kwargs: dict): +# self._path = path +# self._embedding_dim = embedding_dim +# self._db_kwargs = db_kwargs + +# @abc.abstractmethod +# def open(self): +# pass + +# @abc.abstractmethod +# def close(self): +# pass diff --git a/src/rag_utils/vec_db.py b/src/rag_utils/vec_db.py new file mode 100644 index 0000000..9b3c7e7 --- /dev/null +++ b/src/rag_utils/vec_db.py @@ -0,0 +1,21 @@ +from utils.logger import logging, Logger + +comp_logger = Logger().register_component(__file__) + +import json +import re + + +class VDBConfig: + @property + def version(self): + return 0.1 + + def generate_config_file(self): + pass + + def write_to_file(self, filepath): + with open(filepath, "w") as fout: + json.dump(self.generate_config_file(), fout) + + pass diff --git a/src/run_new.py b/src/run_new.py new file mode 100644 index 0000000..866a209 --- /dev/null +++ b/src/run_new.py @@ -0,0 +1,419 @@ +def main(): + import os, sys + import utils.python_utils as pyutils + import time + + if not any([p in arg for p in ["--log_dir", "--create_log_dir"] for arg in sys.argv]): + sys.argv.append(f"--log_dir={os.path.join(pyutils.get_script_dir(__file__), 'output')}") + sys.argv.append(f"--create_log_dir=True") + + from utils.logger import logging, Logger, log_time_breakdown, 
save_config_to_log_dir + + from config import load_config, get_db_collection_name + from utils.python_utils import get_by_path + import utils.colored_print as cprint + + # put those before any other imports to prevent loading wrong libstdc++.so + from monitoring_sys.config_parser.msys_config_parser import StaticEnv, MacroTranslator + from monitoring_sys import MSys + from monitoring_sys.config_parser.msys_config_parser import MSysConfig + + import torch + import argparse + import pickle + import _pickle as cPickle + + from vectordb.milvus_api import milvus_client + from vectordb.lancedb_api import lance_client + from vectordb.qdrant_api import qdrant_client + from vectordb.chroma_api import chroma_client + from vectordb.elastic_api import elastic_client + + from datasetLoader.TextDatasetLoader import TextDatasetLoader + from datasetPreprocess.TextDatasetPreprocess import TextDatasetPreprocess + from datasetLoader.PDFDatasetLoader import PDFDatasetLoader + + # from datasetPreprocess.PDFDatasetPreprocess import PDFDatasetPreprocess + + from RAGRequest.TextsRAGRequest import WikipediaRequests + from RAGPipeline.TextsRAGPipline import TextsRAGPipeline + from RAGPipeline.ImageRAGPipline import ImagesRAGPipeline + from RAGPipeline.retriever.BaseRetriever import BaseRetriever + from RAGPipeline.reranker.CrossEncoderReranker import CrossEncoderReranker + from RAGPipeline.responser.TextsResponser import VLLMResponser + from RAGPipeline.responser.ImagesResponser import ImageResponser + + from encoder.sentenceTransformerEncoder import SentenceTransformerEncoder + from encoder.ColPaliEncoder import ColPaliEncoder + from evaluator.RagasEvaluator import RagasEvaluator + from evaluator.RagasOpenAI import RagasOpenAI + from evaluator.Ragasvllm import Ragasvllm + + # avoid warning about TOKENIZERS_PARALLELISM + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + output_path = Logger().log_dirpath + cprint.iprintf(f"Using output path: {output_path}") + + # parse arguments + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, help="Path to the configuration file") + parser.add_argument( + "--msys-config", type=str, help="Path to the monitoring system configuration file" + ) + # parser.add_argument("-d", "--dry_run", action="store_true", help="Run in dry run mode, no actual processing") + args = parser.parse_known_args()[0] + if not args.config: + raise ValueError("Please provide a configuration file using --config") + config = load_config(args.config) + + if not args.msys_config: + raise ValueError( + "Please provide a monitoring system configuration file using --msys-config" + ) + with open(args.msys_config, "r") as fin: + translated_config = ( + MacroTranslator(StaticEnv.get_static_env("global")).translate(fin).read() + ) + with open(os.path.join(output_path, "translated_msys_config.yaml"), "w") as fout: + fout.write(translated_config) + monitor = MSys(MSysConfig.from_yaml_string(translated_config)) + monitor.report_status(verbose=False, detail=True) + + # set collection name + if not config['sys']['vector_db']['collection_name'] == '': + collection_name = get_db_collection_name(config['sys']['vector_db']['collection_name']) + else: + collection_name = get_db_collection_name(f"{config['run_name']}") + cprint.iprintf(f"*** Start the run with collection {collection_name}") + + # set db + if config["sys"]["vector_db"]["type"] == "milvus": + db_client = milvus_client( + db_path=config["sys"]["vector_db"]["db_path"], + db_token=config["sys"]["vector_db"]["db_token"], + 
collection_name=collection_name, + drop_previous_collection=config["sys"]["vector_db"]["drop_previous_collection"], + # dim=config["sys"]["vector_db"]["dim"], + index_type=config["rag"]["build_index"]["index_type"], + metric_type=config["rag"]["build_index"]["metric_type"], + ) + elif config["sys"]["vector_db"]["type"] == "lancedb": + db_client = lance_client( + db_path=config["sys"]["vector_db"]["db_path"], + collection_name=collection_name, + # dim=config["sys"]["vector_db"]["dim"], + index_type=config["rag"]["build_index"]["index_type"], + metric_type=config["rag"]["build_index"]["metric_type"], + drop_previous_collection=config["sys"]["vector_db"]["drop_previous_collection"], + ) + elif config["sys"]["vector_db"]["type"] == "qdrant": + db_client = qdrant_client( + db_path=config["sys"]["vector_db"]["db_path"], + collection_name=collection_name, + # dim=config["sys"]["vector_db"]["dim"], + index_type=config["rag"]["build_index"]["index_type"], + metric_type=config["rag"]["build_index"]["metric_type"], + drop_previous_collection=config["sys"]["vector_db"]["drop_previous_collection"], + ) + elif config["sys"]["vector_db"]["type"] == "chroma": + db_client = chroma_client( + db_path=config["sys"]["vector_db"]["db_path"], + collection_name=collection_name, + # dim=config["sys"]["vector_db"]["dim"], + index_type=config["rag"]["build_index"]["index_type"], + metric_type=config["rag"]["build_index"]["metric_type"], + drop_previous_collection=config["sys"]["vector_db"]["drop_previous_collection"], + ) + elif config["sys"]["vector_db"]["type"] == "elasticsearch": + db_client = elastic_client( + db_path=config["sys"]["vector_db"]["db_path"], + collection_name=collection_name, + # dim=config["sys"]["vector_db"]["dim"], + index_type=config["rag"]["build_index"]["index_type"], + metric_type=config["rag"]["build_index"]["metric_type"], + drop_previous_collection=config["sys"]["vector_db"]["drop_previous_collection"], + ) + else: + raise ValueError(f"Unsupported vector database type: {config['sys']['vector_db']['type']}") + + db_client.setup() + cprint.iprintf(f"*** Vector DB setup done") + + # prepare workload + dataset_name = config["bench"]["dataset"] + save_config_to_log_dir(args.config) + # for image RAG + if config["bench"]["type"] == "image": + pass + # preprocess dataset + with monitor: + if config["rag"]["action"]["preprocess"]: + log_time_breakdown("start") + if dataset_name == "common-pile/arxiv_papers": + cprint.iprintf( + f"*** Start loading dataset: {dataset_name}, time : {time.monotonic_ns()} " + ) + dataset_ratio = config["bench"]["preprocessing"]["dataset_ratio"] + loader = PDFDatasetLoader(dataset_name=dataset_name) + samples_length = int(loader.total_length * dataset_ratio) + loader.download_pdf(load_num=samples_length) + df = loader.get_dataset_slice(length=samples_length, offset=0) + cprint.iprintf( + f"*** Done Loaded dataset: {dataset_name}, total samples: {len(df)}, done" + ) + log_time_breakdown("chunking") + chunker = PDFDatasetPreprocess() + pages = chunker.chunking_PDF_to_image(df) + + # embedding + if config["rag"]["action"]["embedding"]: + cprint.iprintf(f"*** Start embedding images, time : {time.monotonic_ns()}") + log_time_breakdown("embed") + embedder = ColPaliEncoder( + device="cuda:0", + model_name=config["rag"]["embedding"]["sentence_transformers_name"], + embedding_batch_size=config["rag"]["embedding"]["batch_size"], + ) + embedder.load_encoder() + dict_list = embedder.embedding(pages) + embedder.free_encoder() + print( + f"***Embedding done, total {len(dict_list)} 
embeddings, time : {time.monotonic_ns()}" + ) + + if config["rag"]["action"]["insert"]: + print( + f"***Start inserting embeddings into collection: {collection_name}, time : {time.monotonic_ns()}" + ) + log_time_breakdown("insert") + if config["sys"]["vector_db"]["type"] == "lancedb": + db_client.create_collection( + collection_name=collection_name, + dim=len(dict_list[0]["vector"]), + data_type="image", + ) + + db_client.insert_data( + dict_list=dict_list, + collection_name=collection_name, + insert_batch_size=config["rag"]["insert"]["batch_size"], + create_collection=True, + ) + print( + f"***Insertion done, total {len(dict_list)} embeddings inserted, time : {time.monotonic_ns()}" + ) + log_time_breakdown("done") + if config["rag"]["action"]["generation"] == True: + RAGRequest = WikipediaRequests( + run_name=config["run_name"], + collection_name=collection_name, + req_type="query", + req_count=config["rag"]["retrieval"]["question_num"], + ) + print(f"***End request preparation") + + # prepare pipeline + retriever = BaseRetriever( + collection_name=collection_name, + top_k=config["rag"]["retrieval"]["top_k"], + retrieval_batch_size=config["rag"]["retrieval"]["retrieval_batch_size"], + client=db_client, + ) + responser = ImageResponser( + model=config["rag"]["generation"]["model"], + device=config["rag"]["generation"]["device"], + ) + embedder = ColPaliEncoder( + device="cuda:0", + model_name=config["rag"]["embedding"]["sentence_transformers_name"], + embedding_batch_size=config["rag"]["embedding"]["batch_size"], + ) + RAGPipline = ImagesRAGPipeline( + retriever=retriever, + responser=responser, + embedder=embedder, + ) + + # pipeline.check() + import utils.colored_print as cprint + + with monitor: + RAGPipline.process( + RAGRequest, + batch_size=config["rag"]["pipeline"]["batch_size"], + ) + + return + elif config["bench"]["type"] == "text": + # preprocess dataset + if config["rag"]["action"]["preprocess"]: + # if True: + log_time_breakdown("start") + with monitor: + # TODO: add length and offset into config + # download and load dataset + # if config["rag"]["action"]["preprocess"]: + if dataset_name == "wikimedia/wikipedia": + dataset_ratio = config["bench"]["preprocessing"]["dataset_ratio"] + loader = TextDatasetLoader(dataset_name=dataset_name) + samples_length = int(loader.total_length * dataset_ratio) + df = loader.get_dataset_slice(length=samples_length, offset=0) + cprint.iprintf( + f"*** Done Loaded dataset: {dataset_name}, total samples: {len(df)}, done" + ) + elif dataset_name == "common-pile/arxiv_papers": + dataset_ratio = config["bench"]["preprocessing"]["dataset_ratio"] + loader = PDFDatasetLoader(dataset_name=dataset_name) + samples_length = int(loader.total_length * dataset_ratio) + loader.download_pdf(load_num=samples_length) + df = loader.get_dataset_slice(length=samples_length, offset=0) + cprint.iprintf( + f"*** Done Loaded dataset: {dataset_name}, total samples: {len(df)}, done" + ) + # chunking datasets + if dataset_name == "wikimedia/wikipedia": + chunker = TextDatasetPreprocess( + chunk_size=config["bench"]["preprocessing"]["chunk_size"], + chunk_overlap=config["bench"]["preprocessing"]["chunk_overlap"], + ) + log_time_breakdown("chunking") + chunked_texts = chunker.chunking_text_to_text(df) + cprint.iprintf(f"*** Chunking done, total {len(chunked_texts)} chunks") + elif dataset_name == "common-pile/arxiv_papers": + chunker = PDFDatasetPreprocess() + log_time_breakdown("convert") # todo separate chunking and converting + docs = chunker.convert_PDF_to_text(df) + 
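+                    # Two-phase PDF path: the PDFs were converted to plain-text
+                    # documents above; they are split into retrieval-sized chunks below.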
+                    log_time_breakdown("chunking")
+                    chunked_texts = chunker.chunking_PDF_to_text(docs)
+                    cprint.iprintf(f"*** Chunking done, total {len(chunked_texts)} chunks")
+
+        embeddings_dim = None
+        # embedding
+        if config["rag"]["action"]["embedding"]:
+            cprint.iprintf("*** Start embedding texts")
+            log_time_breakdown("embed")
+            embedder = SentenceTransformerEncoder(
+                device="cuda:0",
+                sentence_transformers_name=config["rag"]["embedding"][
+                    "sentence_transformers_name"
+                ],
+                embedding_batch_size=config["rag"]["embedding"]["batch_size"],
+            )
+            embedder.load_encoder()
+            embeddings_dim = embedder.dim
+            embeddings = embedder.embedding(chunked_texts)
+            embedder.free_encoder()
+            print(f"***Embedding done, total {len(embeddings)} embeddings")
+            if config["rag"]["embedding"]["store"]:
+                store_path = config["rag"]["embedding"]["filepath"]
+                # Persist embeddings so later runs can skip the embedding stage
+                os.makedirs(os.path.dirname(store_path), exist_ok=True)
+                with open(store_path, 'wb') as handle:
+                    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+        if config["rag"]["embedding"]["load"]:
+            log_time_breakdown("load")
+            load_path = config["rag"]["embedding"]["filepath"]
+            with open(load_path, 'rb') as handle:
+                embeddings = pickle.load(handle)
+            print(f"***Embedding loaded, total {len(embeddings)} embeddings")
+            embeddings_dim = len(embeddings[0])
+            # If the pickle stores dicts instead of raw vectors, unpack first:
+            # chunked_texts = [emb['text'] for emb in embeddings]
+            # embeddings = [emb['vector'] for emb in embeddings]
+
+        # insertion
+        if config["rag"]["action"]["insert"]:
+            log_time_breakdown("insert")
+            print(f"***Start inserting embeddings into collection: {collection_name}")
+            if config["sys"]["vector_db"]["type"] == "lancedb":
+                db_client.create_collection(
+                    collection_name=collection_name, dim=embeddings_dim
+                )
+            db_client.insert_data_vector(
+                vector=embeddings,
+                chunks=chunked_texts,
+                collection_name=collection_name,
+                insert_batch_size=config["rag"]["insert"]["batch_size"],
+                create_collection=True,
+            )
+            print(f"***Insertion done, total {len(embeddings)} embeddings inserted")
+
+        # build index
+        if config['rag']['action']['build_index']:
+            log_time_breakdown("build")
+            db_client.build_index(
+                collection_name=collection_name,
+                index_type=config["rag"]["build_index"]["index_type"],
+                metric_type=config["rag"]["build_index"]["metric_type"],
+            )
+            print(f"***Indexing done for collection: {collection_name}")
+            log_time_breakdown("done")
+        # query + retrieval + reranking + generation + evaluation
+        if config["rag"]["action"]["generation"]:
+            RAGRequest = WikipediaRequests(
+                run_name=config["run_name"],
+                collection_name=collection_name,
+                req_type="query",
+                req_count=config["rag"]["retrieval"]["question_num"],
+            )
+            print("***Request preparation done")
+
+            # prepare pipeline
+            retriever = BaseRetriever(
+                collection_name=collection_name,
+                top_k=config["rag"]["retrieval"]["top_k"],
+                retrieval_batch_size=config["rag"]["retrieval"]["retrieval_batch_size"],
+                client=db_client,
+            )
+            if config['rag']['action']['reranking']:
+                reranker = CrossEncoderReranker(
+                    model_name=config["rag"]["reranking"]["rerank_model"],
+                    top_n=config["rag"]["reranking"]["top_n"],
+                    device=config["rag"]["reranking"]["device"],
+                )
+            else:
+                reranker = None
+            if
config["rag"]["action"]["evaluate"]: + evaluator = Ragasvllm( + llm_path=config["rag"]["evaluate"]["evaluator_model"], + ) + else: + evaluator = None + responser = VLLMResponser( + model=config["rag"]["generation"]["model"], + device=config["rag"]["generation"]["device"], + parallelism=config["rag"]["generation"]["parallelism"], + ) + embedder = SentenceTransformerEncoder( + device=config["rag"]["embedding"]["device"], + sentence_transformers_name=config["rag"]["embedding"]["sentence_transformers_name"], + ) + RAGPipline = TextsRAGPipeline( + retriever=retriever, + responser=responser, + embedder=embedder, + reranker=reranker, + evaluator=evaluator, + ) + + # pipeline.check() + import utils.colored_print as cprint + + with monitor: + RAGPipline.process( + RAGRequest, + batch_size=config["rag"]["pipeline"]["batch_size"], + ) + + +if __name__ == "__main__": + main() diff --git a/src/ui_client.py b/src/ui_client.py new file mode 100644 index 0000000..fc8c63b --- /dev/null +++ b/src/ui_client.py @@ -0,0 +1,377 @@ +import streamlit as st +import yaml +import subprocess +import os +import sys +import copy +import requests +import time + +# --- Page Config --- +st.set_page_config( + page_title="RAG Panel", page_icon="🎛️", layout="wide", initial_sidebar_state="expanded" +) + +# --- 1. The Master Template (Hardcoded Defaults) --- +# This matches the structure of your uploaded qdrant_query.yaml +DEFAULT_TEMPLATE = { + "run_name": "default_run", + "bench": { + "dataset": "wikimedia/wikipedia", + "type": "text", + "preprocessing": { + "chunk_overlap": 0, + "chunk_size": 512, + "chunktype": "length", + "dataset_ratio": 0.001, + }, + }, + "sys": { + "devices": {"cpu": "cpu", "gpu_count": 2, "gpus": ["cuda:0", "cuda:1"]}, + "log": {"metrics_log": "./log/default_run.log"}, + "vector_db": { + "type": "lancedb", + "collection_name": "lance_text_full_2", + "db_path": "/mnt/data1/shaobol2/lancedb", + "db_token": "", + "drop_previous_collection": False, + }, + }, + "rag": { + "action": { + "preprocess": False, + "embedding": False, + "insert": False, + "build_index": False, + "reranking": True, + "retrieval": True, + "generation": True, + "evaluate": False, + }, + "build_index": {"index_type": "IVF_HNSW_SQ", "metric_type": "L2"}, + "embedding": { + "device": "cuda:0", + "sentence_transformers_name": "all-MiniLM-L6-v2", + "batch_size": 1024, + "embedding_framework": "sentence_transformers", + "model": "nomic-ai/nomic-embed-text-v2-moe", + "store": False, + "load": True, + "filepath": "/home/shaobol2/RAGPipeline/wiki_entire.pickle", + }, + "insert": {"batch_size": 2048, "collection_name": "", "drop_previous_collection": False}, + "generation": {"device": "cuda:0", "model": "Qwen/Qwen2.5-7B-Instruct", "parallelism": 1}, + "reranking": { + "device": "cuda:0", + "rerank_model": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "top_n": 5, + }, + "evaluate": {"evaluator_model": "Qwen/Qwen2-7B-Instruct-GPTQ-Int8"}, + "retrieval": {"question_num": 192, "retrieval_batch_size": 64, "top_k": 10}, + "pipeline": {"batch_size": 64}, + }, +} + +# Dummy MSYS config required by the script +DEFAULT_MSYS = "../config/monitor/example_config.yaml" + +# """ +# global: +# log_level: INFO +# monitor: +# target: system +# """ + +TEMP_CONFIG_PATH = "temp_ui_config.yaml" +TEMP_MSYS_PATH = "temp_msys_config.yaml" + +# --- 2. Session State Initialization --- +if "config" not in st.session_state: + st.session_state["config"] = copy.deepcopy(DEFAULT_TEMPLATE) + +config = st.session_state["config"] + +# --- 3. 
UI Layout --- + +st.title("🎛️ RAG Panel") + +# Use Tabs to organize the massive amount of settings +tab_main, tab_rag, tab_models, tab_sys, tab_exec = st.tabs( + ["📂 General & Data", "⚡ RAG Actions", "🧠 Models & Params", "🖥️ System & DB", "🚀 Execution"] +) + +# Settings +with tab_main: + st.subheader("Run Settings") + config['run_name'] = st.text_input("Run Name", config['run_name']) + + st.subheader("Benchmark Data") + col1, col2 = st.columns(2) + with col1: + config['bench']['dataset'] = st.text_input("Dataset Name", config['bench']['dataset']) + config['bench']['type'] = st.selectbox( + "Data Type", ["text", "image"], index=0 if config['bench']['type'] == "text" else 1 + ) + + st.write("Dataset Ratio") + + # 1. Define distinct keys for the widgets + key_slider = "ratio_slider" + key_input = "ratio_input" + + # 2. Initialize both keys in session state if missing + if key_slider not in st.session_state: + initial_val = float(config['bench']['preprocessing']['dataset_ratio']) + st.session_state[key_slider] = initial_val + st.session_state[key_input] = initial_val + + def on_slider_change(): + st.session_state[key_input] = st.session_state[key_slider] + config['bench']['preprocessing']['dataset_ratio'] = st.session_state[key_slider] + + def on_input_change(): + # When input changes, force slider to match (clamped 0-1) + val = max(0.0, min(1.0, st.session_state[key_input])) + st.session_state[key_slider] = val + config['bench']['preprocessing']['dataset_ratio'] = val + + # 4. Widgets + rc1, rc2 = st.columns([3, 1]) + with rc1: + st.slider( + "Ratio Slider", + min_value=0.0, + max_value=1.0, + step=0.001, + key=key_slider, + on_change=on_slider_change, + label_visibility="collapsed", + ) + with rc2: + st.number_input( + "Ratio Input", + min_value=0.0, + max_value=1.0, + step=0.001, + format="%.4f", + key=key_input, + on_change=on_input_change, + label_visibility="collapsed", + ) + + with col2: + config['bench']['preprocessing']['chunk_size'] = st.number_input( + "Chunk Size", value=config['bench']['preprocessing']['chunk_size'] + ) + config['bench']['preprocessing']['chunk_overlap'] = st.number_input( + "Chunk Overlap", value=config['bench']['preprocessing']['chunk_overlap'] + ) + config['bench']['preprocessing']['chunktype'] = st.text_input( + "Chunk Type", config['bench']['preprocessing']['chunktype'] + ) + + +# RAG Actions +with tab_rag: + st.subheader("Pipeline Stages") + st.info("Toggle the stages you want to execute in this run.") + + actions = config['rag']['action'] + + c1, c2, c3, c4 = st.columns(4) + with c1: + actions['preprocess'] = st.toggle("1. Preprocess", value=actions['preprocess']) + actions['embedding'] = st.toggle("2. Embedding", value=actions['embedding']) + with c2: + actions['insert'] = st.toggle("3. Insert to DB", value=actions['insert']) + actions['build_index'] = st.toggle("4. Build Index", value=actions['build_index']) + with c3: + actions['retrieval'] = st.toggle("5. Retrieval", value=actions['retrieval']) + actions['reranking'] = st.toggle("6. Reranking", value=actions['reranking']) + with c4: + actions['generation'] = st.toggle("7. Generation", value=actions['generation']) + actions['evaluate'] = st.toggle("8. 
Evaluate", value=actions['evaluate']) + +# --- TAB 3: Models & Parameters --- +with tab_models: + col_a, col_b = st.columns(2) + + with col_a: + st.markdown("### 🧬 Embedding") + emb = config['rag']['embedding'] + emb['sentence_transformers_name'] = st.text_input( + "Sentence Transformer", emb['sentence_transformers_name'] + ) + emb['model'] = st.text_input("Embedding Model", emb['model']) + emb['batch_size'] = st.number_input("Embed Batch Size", value=emb['batch_size']) + emb['device'] = st.text_input("Embed Device", emb['device']) + + st.caption("Storage") + emb['load'] = st.checkbox("Load Embeddings from File", value=emb['load']) + emb['store'] = st.checkbox("Store Embeddings to File", value=emb['store']) + emb['filepath'] = st.text_input("Pickle Filepath", emb['filepath']) + + st.markdown("### 🔎 Retrieval") + ret = config['rag']['retrieval'] + ret['top_k'] = st.number_input("Top K", value=ret['top_k']) + ret['question_num'] = st.number_input("Question Count", value=ret['question_num']) + ret['retrieval_batch_size'] = st.number_input( + "Retrieval Batch Size", value=ret['retrieval_batch_size'] + ) + + with col_b: + st.markdown("### 🤖 Generation (LLM)") + gen = config['rag']['generation'] + # gen['model'] = st.text_input("LLM Model ID", gen['model']) + gen['model'] = st.selectbox( + "LLM Model", + [ + "Qwen/Qwen2.5-7B-Instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + "openai/gpt-oss-20b", + "Qwen/Qwen2.5-72B-Instruct", + ], + index=( + [ + "Qwen/Qwen2.5-7B-Instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + "openai/gpt-oss-20b", + "Qwen/Qwen2.5-72B-Instruct", + ].index(gen['model']) + if gen['model'] + in [ + "Qwen/Qwen2.5-7B-Instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + "openai/gpt-oss-20b", + "Qwen/Qwen2.5-72B-Instruct", + ] + else 0 + ), + ) + gen['device'] = st.text_input("LLM Device", gen['device']) + gen['parallelism'] = st.number_input("Parallelism", value=gen['parallelism']) + + st.markdown("### ⚖️ Reranking") + rer = config['rag']['reranking'] + rer['rerank_model'] = st.text_input("Reranker Model", rer['rerank_model']) + rer['top_n'] = st.number_input("Top N Rerank", value=rer['top_n']) + + st.markdown("### 📈 Evaluation") + config['rag']['evaluate']['evaluator_model'] = st.text_input( + "Evaluator Model", config['rag']['evaluate']['evaluator_model'] + ) + +# --- TAB 4: System & DB --- +with tab_sys: + col_x, col_y = st.columns(2) + + with col_x: + st.subheader("Vector Database") + vdb = config['sys']['vector_db'] + vdb['type'] = st.selectbox( + "DB Type", + ["qdrant", "milvus", "lancedb", "chroma", "elasticsearch"], + index=( + ["qdrant", "milvus", "lancedb", "chroma", "elasticsearch"].index(vdb['type']) + if vdb['type'] in ["qdrant", "milvus", "lancedb", "chroma", "elasticsearch"] + else 0 + ), + ) + vdb['db_path'] = st.text_input("DB Path/URL", vdb['db_path']) + vdb['collection_name'] = st.text_input("Collection Name", vdb['collection_name']) + vdb['drop_previous_collection'] = st.checkbox( + "Drop Previous Collection", vdb['drop_previous_collection'] + ) + + st.caption("Index Settings") + idx = config['rag']['build_index'] + idx['index_type'] = st.text_input("Index Type", idx['index_type']) + idx['metric_type'] = st.text_input("Metric Type", idx['metric_type']) + + with col_y: + st.subheader("System Resources") + dev = config['sys']['devices'] + dev['gpu_count'] = st.number_input("GPU Count", value=dev['gpu_count']) + current_gpus = ",".join(dev['gpus']) + new_gpus = st.text_input("GPUs (comma separated)", current_gpus) + dev['gpus'] = [x.strip() for x in 
new_gpus.split(",") if x.strip()]
+        config['sys']['log']['metrics_log'] = st.text_input(
+            "Log File Path", config['sys']['log']['metrics_log']
+        )
+
+# Execution
+with tab_exec:
+    st.subheader("🚀 Ready to Launch")
+
+    # Save Config Helper
+    def save_current_config():
+        # Only the run config needs to be written out; the monitor config already
+        # exists on disk and its path (DEFAULT_MSYS) is passed to the script as-is.
+        with open(TEMP_CONFIG_PATH, "w") as f:
+            yaml.dump(config, f, default_flow_style=False)
+
+    col_run, col_preview = st.columns([1, 2])
+
+    with col_run:
+        if st.button("▶ START BENCHMARK", type="primary", use_container_width=True):
+            save_current_config()
+
+            # Locate script
+            script_name = "run_new.py"
+            if os.path.exists(script_name):
+                script_path = script_name
+            elif os.path.exists(os.path.join("..", script_name)):
+                script_path = os.path.join("..", script_name)
+            else:
+                st.error(f"Cannot find {script_name}")
+                st.stop()
+
+            cmd = [
+                sys.executable,
+                "-u",
+                script_path,
+                "--config",
+                TEMP_CONFIG_PATH,
+                "--msys-config",
+                DEFAULT_MSYS,
+            ]
+
+            st.divider()
+            st.write("### 📜 Live Terminal Output")
+            log_placeholder = st.empty()
+            full_logs = ""
+
+            try:
+                process = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                    bufsize=1,
+                )
+
+                while True:
+                    line = process.stdout.readline()
+                    if not line and process.poll() is not None:
+                        break
+                    if line:
+                        full_logs += line
+                        # Truncate to avoid UI freezing if logs are huge
+                        display_logs = (
+                            full_logs
+                            if len(full_logs) < 10000
+                            else "...[older logs truncated]...\n" + full_logs[-10000:]
+                        )
+                        log_placeholder.code(display_logs, language="bash")
+
+                if process.returncode == 0:
+                    st.success("✅ Process Finished Successfully")
+                else:
+                    st.error(f"❌ Process Failed (Code {process.returncode})")
+            except Exception as e:
+                st.error(f"Execution Error: {e}")
+
+    with col_preview:
+        with st.expander("📄 Review Generated YAML Config", expanded=False):
+            st.code(yaml.dump(config, default_flow_style=False), language="yaml")
diff --git a/src/utils/colored_print.py b/src/utils/colored_print.py
new file mode 100644
index 0000000..bb98ccb
--- /dev/null
+++ b/src/utils/colored_print.py
@@ -0,0 +1,103 @@
+import io, logging
+from enum import Enum
+import utils.env_variable as env
+
+# respect NO_COLOR
+no_color = env.no_color()
+
+
+def color_settings(force_color: bool = False):
+    # force_color=True enables colored output (clears no_color), even when the
+    # NO_COLOR environment variable is set.
+    global no_color
+    no_color = not force_color
+
+
+class ANSIColors:
+    BLACK = "\033[30m"
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
+    ENDC = "\033[0m"
+
+
+class MessageLevel(Enum):
+    EMERG = 0
+    ALERT = 1
+    CRIT = 2  # critical conditions
+    ERR = 3  # error conditions
+    WARNING = 4  # warning conditions
+    NOTICE = 5  # normal but significant condition
+    INFO = 6  # informational
+    DEBUG = 7  # debug-level messages
+
+
+class ColoredPrintSetting:
+    MSG_COLOR_DICT: dict[int, str] = {
+        logging.CRITICAL: ANSIColors.MAGENTA,
+        logging.ERROR: ANSIColors.RED,
+        logging.WARNING: ANSIColors.YELLOW,
+        logging.INFO: ANSIColors.BLUE,
+        logging.DEBUG: ANSIColors.CYAN,
+    }
+
+
+def colored_print(*args, ansi_color_str: str | ANSIColors, **kwargs) -> None:
+    if no_color:
+        print(*args, **kwargs)
+    else:
+        with io.StringIO() as output_str:
+            print(*args, file=output_str, end="")
+            print(f"{ansi_color_str}{output_str.getvalue()}{ANSIColors.ENDC}", **kwargs)
+
+
+def cprintf(*args, **kwargs):
+    """Argument list same as print"""
+    colored_print(
+        *args,
ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[logging.CRITICAL], **kwargs + ) + + +def eprintf(*args, **kwargs): + """Argument list same as print""" + colored_print(*args, ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[logging.ERROR], **kwargs) + + +def wprintf(*args, **kwargs): + """Argument list same as print""" + colored_print( + *args, ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[logging.WARNING], **kwargs + ) + + +def iprintf(*args, **kwargs): + """Argument list same as print""" + colored_print(*args, ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[logging.INFO], **kwargs) + + +def dprintf(*args, **kwargs): + """Argument list same as print""" + colored_print(*args, ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[logging.DEBUG], **kwargs) + + +def __check_level(level: str | int) -> int: + # copying logging._checkLevel + if isinstance(level, int): + rv = level + elif str(level) == level: + if level not in logging._nameToLevel: + raise ValueError("Unknown level: %r" % level) + rv = logging._nameToLevel[level] + else: + raise TypeError("Level not an integer or a valid string: %r" % (level,)) + return rv + + +def lprintf(level: str | int, *args, **kwargs): + """First arg is logging level Argument list same as print""" + colored_print( + *args, ansi_color_str=ColoredPrintSetting.MSG_COLOR_DICT[__check_level(level)], **kwargs + ) diff --git a/src/utils/decorator.py b/src/utils/decorator.py new file mode 100644 index 0000000..053b35f --- /dev/null +++ b/src/utils/decorator.py @@ -0,0 +1,49 @@ +def func_property(**kwargs): + def decorate(func): + for k in kwargs: + setattr(func, k, kwargs[k]) + return func + + return decorate + + +def singleton(cls): + """ + Use this if the class itself do not need to be referenced in the future. + This will convert the class to a function that returns the singleton instance + Syntax is as follows: + ``` + @decorator.singleton + class MySingletonClass(): + ... + ``` + Reference: https://divyakhatnar.medium.com/singleton-in-python-be59f7698a51 + """ + instances = {} + + def getinstance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + + return getinstance + + +class Singleton(type): + """ + Use this if the class itself is still needed. + Syntax is as follows: + ``` + class MySingletonClass(metaclass=Singleton): + ... 
+ ``` + REVIEW: this is not a decorator, it is put here for now + Reference: https://divyakhatnar.medium.com/singleton-in-python-be59f7698a51 + """ + + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super().__call__(*args, **kwargs) + return cls._instances[cls] diff --git a/src/utils/env_variable.py b/src/utils/env_variable.py new file mode 100644 index 0000000..606948d --- /dev/null +++ b/src/utils/env_variable.py @@ -0,0 +1,48 @@ +import os, functools + +IS_DEBUG_ENVIORN = "DEBUG" +NO_COLOR_ENVIORN = "NO_COLOR" + + +def check_env(env: str) -> None | str: + return os.environ.get(env, None) + + +def check_env_exists(env: str) -> bool: + return env in os.environ + + +def check_env_exists_and_not_empty(env: str) -> bool: + val = os.environ.get(env, None) + return val is not None and len(val) != 0 + + +def check_env_true(env: str) -> bool: + """ + A environment variable is considered true if the variable exists and it is + 1) an integer with non-zero value, or + 2) a non-empty string + """ + val = os.environ.get(env, None) + if val is None: + return False + is_digit = val.isdigit() + return (is_digit and int(val) != 0) or (not is_digit and len(val) != 0) + + +def set_env(env: str, val: str | int) -> None | str: + ret = os.environ.get(env, None) + if isinstance(val, int): + val = str(val) + os.environ[env] = val + return ret + + +@functools.cache +def is_debug() -> bool: + return check_env_exists(IS_DEBUG_ENVIORN) + + +@functools.cache +def no_color() -> bool: + return check_env_exists_and_not_empty(NO_COLOR_ENVIORN) diff --git a/src/utils/logger.py b/src/utils/logger.py new file mode 100644 index 0000000..d5bb1fe --- /dev/null +++ b/src/utils/logger.py @@ -0,0 +1,380 @@ +from __future__ import annotations + +import utils.decorator as deco +import utils.env_variable as env +import utils.colored_print as cprint +import shutil + +import os, sys, datetime, uuid, psutil, time +import logging + +from absl import flags as abflags + +abflags.DEFINE_string("log_dir", "log", "Path to dir that stores log files") +abflags.DEFINE_boolean( + "create_log_dir", False, "Whether to create dir if designated log_dir does not exist" +) +abflags.DEFINE_boolean( + "debug_no_logging_file", False, "Whether to disable logging to file, only print to stderr" +) + + +class LoggingCustomStreamFormatter(logging.Formatter): + def __init__(self, fmt=None, datefmt=None, style='%', validate=True, *, defaults=None): + super().__init__( + fmt=fmt, datefmt=datefmt, style=style, validate=validate, defaults=defaults + ) + self.__formats = { + level: f"{cprint.ColoredPrintSetting.MSG_COLOR_DICT[level]}{fmt}{cprint.ANSIColors.ENDC}" + for level in ( + logging.DEBUG, + logging.INFO, + logging.WARNING, + logging.ERROR, + logging.CRITICAL, + ) + } + + def format(self, record: logging.LogRecord) -> str: + if record.levelno in self.__formats: + self._style._fmt = self.__formats[record.levelno] + return super().format(record) + + +@deco.singleton +class Logger: + """ + Wrapper of a two-level hierarchical logging.Logger + """ + + def __init__(self) -> None: + # default log entry and log file time format + self.__log_time_format = r"%Y-%m-%d %H:%M:%S %z" + self.__dir_time_format = r"%Y-%m-%dT%H:%M:%S%z" + + self.__log_file_format = "[%(asctime)s](%(filename)s:%(lineno)d) %(levelname)s %(message)s" + self.__log_stderr_format = "%(levelname)s %(message)s" + + abflags.FLAGS.unparse_flags() + abflags.FLAGS(sys.argv, known_only=True) + + if not abflags.FLAGS.debug_no_logging_file: + 
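+            # Each run logs into its own timestamped directory under log_dir; a uuid
+            # suffix disambiguates runs that start within the same second (see below).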
logging_folder = os.path.realpath(abflags.FLAGS.log_dir) + + # do not create folder by default if logging_folder is provided without create_dir_if_missing + create_dir_if_missing = ( + not abflags.FLAGS["create_log_dir"].present or abflags.FLAGS.create_log_dir + ) and (not abflags.FLAGS["log_dir"].present or abflags.FLAGS.create_log_dir) + + if not os.path.isdir(logging_folder): + if not create_dir_if_missing: + cprint.eprintf( + f"Logger directory creation disabled with " + f"target directory {logging_folder} missing, abort", + file=sys.stderr, + ) + exit(1) + os.makedirs(logging_folder) + + dirname = datetime.datetime.now().astimezone().strftime(self.__dir_time_format) + + disambiguated_dirname = dirname + disambiguated_abspath = os.path.join(logging_folder, dirname) + while os.path.isdir(disambiguated_abspath): + disambiguated_dirname = f"{dirname}.{uuid.uuid4()}" + disambiguated_abspath = os.path.join(logging_folder, disambiguated_dirname) + self.__log_folder = disambiguated_dirname + self.__log_name = "python_rt.log" + self.__log_dirpath = os.path.realpath(disambiguated_abspath) + self.__log_path = os.path.realpath(os.path.join(disambiguated_abspath, self.__log_name)) + os.makedirs(os.path.join(logging_folder, self.__log_folder)) + + # set logging file handler and format + cur_root_handlers = logging.root.handlers + assert len(cur_root_handlers) == 1 + cur_root_handlers[0].setFormatter( + LoggingCustomStreamFormatter( + fmt=self.__log_stderr_format, + datefmt=self.__log_time_format, + ) + ) + + # add a file handler on top of the default stream handler + handler = logging.FileHandler( + filename=self.__log_path, + mode="w", + delay=True, + ) + handler.setFormatter( + logging.Formatter( + fmt=self.__log_file_format, + datefmt=self.__log_time_format, + ) + ) + logging.root.addHandler(handler) + else: + self.__log_folder = "" + self.__log_name = "" + self.__log_dirpath = "" + self.__log_path = "" + + # register this component and a default logger + module_name = self.__get_readable_name(__file__, 0) + self.__default_logger: logging.Logger = logging.root.getChild(module_name) + self.__default_logger.setLevel(logging.DEBUG if env.is_debug() else logging.WARNING) + self.__registered_logger_names = set() + + @property + def default_logging_level(self) -> int: + return self.__default_logger.level + + @property + def log_folder(self) -> str: + return self.__log_folder + + @property + def log_filename(self) -> str: + return self.__log_name + + @property + def log_dirpath(self) -> str: + return self.__log_dirpath + + @property + def log_path(self) -> str: + return self.__log_path + + @property + def log_time_format(self) -> str: + return self.__log_time_format + + @property + def dir_time_format(self) -> str: + return self.__dir_time_format + + def __get_comp_logger(self, comp_name: str) -> logging.Logger | None: + return ( + logging.root.getChild(comp_name) + if comp_name in self.__registered_logger_names + else None + ) + + def __get_comp_logger_or_default(self, comp_name: str | None) -> logging.Logger: + logger = None + if comp_name is not None: + logger = self.__get_comp_logger(comp_name) + return self.__default_logger if logger is None else logger + + def __register_comp_logger(self, comp_name: str, level: str | int | None) -> None: + if comp_name in self.__registered_logger_names: + return + self.__registered_logger_names.add(comp_name) + self.__default_logger.getChild(comp_name).setLevel( + level if level is not None else logging.NOTSET + ) + + def set_default_logging_level(self, level: 
str | int | None) -> int: + self.__default_logger.setLevel(level if level is not None else logging.NOTSET) + return self.__default_logger.level + + def set_component_logging_level(self, comp_name: str, level: str | int | None) -> int: + logger = self.__get_comp_logger(comp_name) + assert logger is not None, comp_logger.log( + logging.ERROR, f"Component {comp_name} not registered" + ) + logger.setLevel(level if level is not None else logging.NOTSET) + return logger.getEffectiveLevel() + + def __get_readable_name(self, comp_name: str, name_level: int) -> str: + """ + Get more human-readable name, interpreted from input component name (which is likely to be + __file__ by design). + + Input Args: + `comp_name`: input name, likely to be __file__ of corresponding component + `name_level`: level of name to be returned, indicating number of directory levels + included in front of the component name. + + Returns: + more human-readable component name + """ + comp_name = os.path.abspath(comp_name) + + dir_names = [] + comp_dir = os.path.dirname(comp_name) + for _ in range(name_level): + dir_names.append(os.path.basename(comp_dir)) + comp_dir = os.path.dirname(comp_dir) + dir_name = " / ".join(dir_names).replace("_", " ").replace("-", " ") + + comp_name = os.path.basename(comp_name).split(".")[0] + comp_name = comp_name.replace("_", " ").replace("-", " ") + + if len(dir_name) != 0: + comp_name = f"{dir_name} / {comp_name}" + if " " in comp_name: + # for names with snake_case or kebab-case + comp_name = " ".join([word.capitalize() for word in comp_name.split()]) + else: + # for names with camelCase or PascalCase + split_idxs = [0, *[i for i, c in enumerate(comp_name) if c.isupper()], len(comp_name)] + comp_name = " ".join( + [ + comp_name[split_idxs[i] : split_idxs[i + 1]].capitalize() + for i in range(len(split_idxs) - 1) + ] + ) + return comp_name + + def register_component( + self, + comp_name: str, + level: str | int | None = None, + auto_readable: bool = True, + name_level: int = 0, + ) -> CompLogger: + """ + Register a component with a name and logging level. + + If `auto_readable` is True, the component name will be converted to a more human-readable + form, which is interpreted from the input component name (which is likely to be __file__ by + design). The human-readable name will be formatted as: + "[dir1] / [dir2] / ... / [component_name]", where [dir1], [dir2], ... are the directory + names of the component file, and [component_name] is the name of the component file without + extension, with underscores replaced by spaces and each word capitalized. The number of + directory levels included in the name is determined by `name_level`. If a custom name is + desired, set `auto_readable` to False and `comp_name` to the desired name. + + Args: + `comp_name`: name of the component, likely to be __file__ of corresponding component + `level`: logging level for this component, default to None (which means NOTSET) + `auto_readable`: whether to convert the component name to a more human-readable form + (default True) + `name_level`: level of name to be returned, indicating number of directory levels + included in front of the component name (default 1) + + Returns: + A CompLogger instance for the component. + + Raises: + AssertionError: if the component name is already registered. 
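+
+        Example (illustrative; assumes the registering file is src/utils/logger.py):
+
+            comp_logger = Logger().register_component(__file__, name_level=1)
+            # auto_readable renders ".../utils/logger.py" as "Utils / Logger"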
+ """ + if auto_readable: + comp_name = self.__get_readable_name(comp_name, name_level) + assert self.__get_comp_logger(comp_name) is None, comp_logger.log( + logging.ERROR, f"Component name {comp_name} is registered twice" + ) + self.__register_comp_logger(comp_name, level) + return CompLogger(comp_name) + + def get_component_logging_header(self) -> str: + return f"<%s> " + + def component_should_log(self, comp_name: str | None, level: int) -> bool: + logger = None + if comp_name is not None: + logger = self.__get_comp_logger(comp_name) + logger = self.__default_logger if logger is None else logger + return logger.isEnabledFor(level) + + def log( + self, comp_name: str | None, level: int, msg: str, *args, stacklevel=3, **kwargs + ) -> None: + """ + Log a message with the specified component name and logging level. + + Args: + comp_name: The name of the component. + level: The logging level. + msg: The message to log. + *args: Additional arguments to pass to the logger. + stacklevel: The stack level to use for the logger, default to be 3 assuming calling from + component logger. + **kwargs: Additional keyword arguments to pass to the logger. + """ + header: str = self.get_component_logging_header() if comp_name is not None else "" + logger: logging.Logger = self.__get_comp_logger_or_default(comp_name) + logger.log(level, header + msg, comp_name, *args, stacklevel=stacklevel, **kwargs) + + +class CompLogger: + def __init__(self, comp_name: str): + self.__comp_name = comp_name + self.__logger = Logger() + + def log(self, level: int, msg: str, *args, **kwargs) -> None: + self.__logger.log(self.__comp_name, level, msg, *args, **kwargs) + + @property + def comp_name(self) -> str: + return self.__comp_name + + def get_augmented_message(self, msg: str) -> str: + header: str = ( + Logger().get_component_logging_header() if self.__comp_name is not None else "" + ) + return header % self.__comp_name + msg + + +comp_logger = Logger().register_component(__file__) + +import re +import sys +import atexit +import signal +import traceback + +# saving the default exception handler +default_excepthook = sys.excepthook + + +def exc_handler(exctype, value, tb): + """ + Replaced exception handler, added the functionality to log the exception and raise a SIGABRT. + """ + # remove default stream handler so exceptions do not print to stderr + for handler in logging.root.handlers: + if isinstance(handler, logging.StreamHandler): + logging.root.removeHandler(handler) + + # invoke the default exception hook + default_excepthook(exctype, value, tb) + + log_filename = Logger().log_path + # only log when there is a log file + if len(log_filename) != 0: + # format message and log it + msg = re.subn(r"%", r"%%", "".join(traceback.format_exception(exctype, value, tb)))[0] + comp_logger.log(logging.FATAL, msg) + # comp_logger must exist at this point + # NOTE a ":0" is appended so that the colon in filename does not confuse some smart file path + # resolvers (e.g. 
in VSCode) + cprint.wprintf( + comp_logger.get_augmented_message(f"Log saved to file {log_filename}:0"), + file=sys.stderr, + ) + # raise sigterm to signal program termination + signal.raise_signal(signal.SIGABRT) + + +# register the custom exception handler with system +sys.excepthook = exc_handler + + +def log_time_breakdown(tag: str): + time_ns = time.monotonic_ns() + with open(os.path.join(Logger().log_dirpath, "time_break_down.txt"), "a") as fout: + fout.write(f"{tag}, {time_ns}\n") + return + + +def save_config_to_log_dir(config_path: str): + """Copy the given config file into log_dirpath/config/""" + log_dir = Logger().log_dirpath + config_dir = os.path.join(log_dir, "config") + os.makedirs(config_dir, exist_ok=True) + + # Copy config file + dest_path = os.path.join(config_dir, os.path.basename(config_path)) + shutil.copy2(config_path, dest_path) + print(f"[INFO] Config file saved to {dest_path}") diff --git a/src/utils/python_utils.py b/src/utils/python_utils.py new file mode 100644 index 0000000..30535ba --- /dev/null +++ b/src/utils/python_utils.py @@ -0,0 +1,115 @@ +import os +import sys +import hashlib +import subprocess +from typing import Protocol, runtime_checkable + + +# NOTE [Python typing on SupportsRead and SupportsWrite]: +# The typing module does not provide SupportsRead and SupportsWrite instances for runtime type +# checking, a custom implementation is provided below. The str version for SupportsRead and +# SupportsWrite is provided below. +@runtime_checkable +class SupportsReadStr(Protocol): + def read(self, size: int | None = -1, /) -> str: ... + + +@runtime_checkable +class SupportsWriteStr(Protocol): + def write(self, data: str, /) -> None: ... + + +def get_script_path(file: str) -> str: + return os.path.realpath(file) + + +def get_script_dir(file: str) -> str: + return os.path.dirname(get_script_path(file)) + + +def printerr(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +def safeval(val, default): + if val: + return val + return default + + +def hash_file(algo, path): + assert algo in hashlib.algorithms_available, f"Hash algorithm {algo} is not supported" + assert os.path.isfile(path), f"File \"{path}\" does not exist (Working dir \"{os.getcwd()}\")" + bufsize = 131072 + hs = hashlib.new(algo) + buf = bytearray(bufsize) + mv = memoryview(buf) + with open(path, "rb", buffering=0) as fin: + while nbytes := fin.readinto(mv): + hs.update(mv[:nbytes]) + return hs.hexdigest() + + +dependency_check_funcs = { + "f": lambda s: os.path.isfile(s), + "d": lambda s: os.path.isdir(s), + "e": lambda s: os.access(s, os.F_OK), + "r": lambda s: os.access(s, os.R_OK), + "w": lambda s: os.access(s, os.W_OK), + "x": lambda s: os.access(s, os.X_OK), +} + + +def check_dependency(test_type, dependency): + func = dependency_check_funcs.get(test_type) + assert func is not None, f"Invalid type string {test_type}" + + if func(dependency): + return os.path.realpath(dependency) + + +def get_by_path(obj, path): + keys = path.split('.') + for key in keys: + try: + key = int(key) + except ValueError: + pass + obj = obj[key] + return obj + + +# def set_by_path(obj, path, value): +# keys = path.split('.') +# for key in keys[:-1]: +# try: +# key = int(key) +# except ValueError: +# pass +# obj = obj[key] +# last_key = keys[-1] +# try: +# last_key = int(last_key) +# except ValueError: +# pass +# obj[last_key] = value + + +def find_device_for_path(path: str, device_name_only: bool = True) -> str | None: + path = os.path.realpath(path) + if not os.path.exists(path): + return None + 
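+    # Match the file's st_dev against each mount point in /proc/mounts to find the
+    # backing block device (Linux-specific; returns None if no mount matches).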
dev = os.stat(path).st_dev + + with open("/proc/mounts") as f: + for line in f: + line_split = line.split() + target_dev, mount_point, *_ = line_split + try: + if os.stat(mount_point).st_dev == dev: + if device_name_only: + return os.path.basename(target_dev) + return target_dev + except Exception: + continue + return None diff --git a/src/vectordb/DBInstance.py b/src/vectordb/DBInstance.py new file mode 100644 index 0000000..6336432 --- /dev/null +++ b/src/vectordb/DBInstance.py @@ -0,0 +1,60 @@ +from abc import ABC, abstractmethod + + +# the db instance +# one db instance may contain multiple collections, have a default collection here +class DBInstance(ABC): + def __init__(self, **kwargs): + self.db_path = kwargs.get("db_path", None) + self.collections = kwargs.get("collections", []) + self.default_collection = kwargs.get("collection_name", None) + # self.device = kwargs.get("device", "cpu") + self.drop_previous_collection = kwargs.get("drop_previous_collection", False) + self.client = None + + @abstractmethod + def setup(self): + pass + + # collection related + @abstractmethod + def create_collection(self, collection_name): + # Create a new collection in the database. + pass + + @abstractmethod + def has_collection(self, collection_name): + """ + Check if the collection exists in the database. + :param collection_name: Name of the collection to check. + :return: True if the collection exists, False otherwise. + """ + pass + + @abstractmethod + def drop_collection(self, collection_name): + """ + Drop the specified collection from the database. + :param collection_name: Name of the collection to drop. + """ + pass + + @abstractmethod + def insert_data(self, vectors, chunks, collection_name=None): + """ + Insert data into the database. + :param vectors: Embeddings to be inserted. + :param chunks: Corresponding text chunks. + """ + pass + + @abstractmethod + def query_search(self, query_vector, collection_name=None): + pass + + # @abstractmethod + # def close(self): + # """ + # Close the database connection. + # """ + # pass diff --git a/src/vectordb/README.md b/src/vectordb/README.md new file mode 100644 index 0000000..4158a23 --- /dev/null +++ b/src/vectordb/README.md @@ -0,0 +1,136 @@ + +# Vector Database Module + +This module provides a unified interface for interacting with various locally deployable Vector Databases. RASB abstracts the low-level client management, allowing you to switch between different backends (e.g., changing from LanceDB to Milvus) by simply modifying a configuration file. + +## 📦 Supported Databases + +We currently support the following vector databases and index types: + +| Database | Supported Index Types | Device Support | Notes | +| :--- | :--- | :--- | :--- | +| **LanceDB** | IVF-PQ, IVF-Flat, HNSW | CPU/GPU | Embedded, serverless, highly memory efficient. | +| **Milvus** | HNSW, IVF, DiskANN, ScaNN | CPU/GPU | Requires a running server instance (Docker/K8s). | +| **Qdrant** | HNSW | CPU/GPU | Requires a running server instance. | +| **Chroma** | HNSW | CPU | Embedded or Client/Server. | +| **Elasticsearch** | HNSW, Flat | CPU | Requires a running server instance. | + +--- + +## 🛠️ Setup Instructions by Type + +Before running the benchmark, ensure you have installed the necessary Python drivers. If you followed the main installation guide, these should already be in your environment. + +### 1. LanceDB (Recommended for Local Testing) +LanceDB runs in-process and does not require a separate server. 
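+
+Before the prerequisites and YAML wiring below, here is a minimal standalone sketch of
+the embedded workflow (illustrative only: the storage path, table name, and toy vectors
+are placeholders; only the `lancedb` package is assumed):
+
+```python
+import lancedb
+
+# "Connecting" just opens (or creates) a local directory; no server is involved.
+db = lancedb.connect("./lancedb_data")
+
+# Tables are created from plain lists of dicts; the vector column is inferred.
+table = db.create_table(
+    "demo_vectors",
+    data=[
+        {"vector": [0.1, 0.2], "text": "first chunk"},
+        {"vector": [0.9, 0.8], "text": "second chunk"},
+    ],
+)
+
+# Nearest-neighbor search returns the stored payload alongside each match.
+hits = table.search([0.1, 0.2]).limit(1).to_list()
+print(hits[0]["text"])
+```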
+
+* **Prerequisites:** `pip install lancedb`
+* **Storage:** Data is stored in a local directory (e.g., `./lancedb_data`).
+
+**Configuration (`config/your_config.yaml`):**
+```yaml
+vector_db:
+  type: "lancedb"
+  db_path: "/mnt/data/my_lancedb" # Path to store the database files
+  collection_name: "wiki_vectors"
+```
+
+### 2. Milvus (GPU via Docker Compose)
+
+If you plan to use **Milvus** as the vector store, follow the official guide to run Milvus with GPU support using Docker Compose:
+➡️ **[Run Milvus with GPU Support Using Docker Compose](https://milvus.io/docs/install_standalone-docker-compose-gpu.md)**
+
+After Milvus is up, point your pipeline config to its URL:
+```yaml
+  vector_db:
+    collection_name: 'milvus_test'
+    db_path: http://localhost:19530
+    db_token: root:Milvus
+    drop_previous_collection: false
+    type: milvus
+```
+
+### 3. Qdrant (Docker)
+To use Qdrant, run the official Docker container. This exposes the database on port 6333.
+
+```bash
+docker run -p 6333:6333 -p 6334:6334 \
+    -v $(pwd)/qdrant_storage:/qdrant/storage:z \
+    qdrant/qdrant
+```
+Change the configuration to use Qdrant:
+```yaml
+  vector_db:
+    type: "qdrant"
+    db_path: "http://localhost:6333" # Qdrant server URL
+    collection_name: "test_collection"
+    # Qdrant doesn't typically need a token for local Docker, but if configured:
+    # db_token: "your-api-key"
+```
+
+### 4. Chroma (Embedded or Client/Server)
+Chroma is often used in an embedded mode (similar to LanceDB) but can also run as a server. The default setup here assumes a persistent local storage mode.
+
+* **Prerequisites:** `pip install chromadb`
+* **Storage:** Data is stored in a local directory (e.g., `./chroma_data`).
+
+```yaml
+  vector_db:
+    type: "chroma"
+    db_path: "./chroma_data" # Local path for persistence
+    collection_name: "chroma_test"
+```
+
+### 5. Elasticsearch (Docker with kNN)
+Elasticsearch supports dense vector search natively. Ensure you have allocated enough memory to Docker.
+
+Run Elasticsearch with Docker:
+```bash
+docker run -p 9200:9200 -e "discovery.type=single-node" \
+    -e "xpack.security.enabled=false" \
+    -m 4GB docker.elastic.co/elasticsearch/elasticsearch:8.11.1
+```
+Configuration:
+```yaml
+  vector_db:
+    type: "elasticsearch"
+    db_path: "http://localhost:9200"
+    collection_name: "elastic_test"
+    drop_previous_collection: true # Elastic indices often need fresh creation for mapping changes
+```
+
+## Adding a New Vector Database
+This pipeline uses an abstract base class, DBInstance (defined in [DBInstance.py](./DBInstance.py)), to enforce a consistent API across all vector stores. To add support for a new database (e.g., Weaviate, Pinecone), follow these steps:
+
+1. Create a New Class: Create a new file (e.g., MyNewDB.py) in vectordb.
+2. Inherit from DBInstance: Implement all abstract methods defined in the base class.
+```python
+from .DBInstance import DBInstance
+
+class MyNewDB(DBInstance):
+    def setup(self):
+        # Initialize client connection
+        pass
+
+    def create_collection(self, collection_name):
+        # Create a collection
+        pass
+
+    def has_collection(self, collection_name):
+        # Check existence
+        pass
+
+    def drop_collection(self, collection_name):
+        # Clean up
+        pass
+
+    def insert_data(self, vectors, chunks, collection_name=None):
+        # Batch insertion logic
+        pass
+
+    def query_search(self, query_vector, collection_name=None):
+        # Search logic returning top_k results
+        pass
+```
+3. Register the Class: Register your new class in the database dispatch in `run_new.py` so it can be instantiated via the config `type` string (see the sketch below).
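+
+A minimal sketch of that registration (the `"mynewdb"` type string, the `MyNewDB` class,
+and the `make_db_client` helper are all hypothetical; the config keys mirror the ones the
+existing clients already consume):
+
+```python
+from vectordb.MyNewDB import MyNewDB  # hypothetical module from step 1
+
+
+def make_db_client(config: dict, collection_name: str):
+    """Pick a DBInstance backend from the config, as run_new.py's if/elif chain does."""
+    db_cfg = config["sys"]["vector_db"]
+    if db_cfg["type"] == "mynewdb":
+        return MyNewDB(
+            db_path=db_cfg["db_path"],
+            collection_name=collection_name,
+            index_type=config["rag"]["build_index"]["index_type"],
+            metric_type=config["rag"]["build_index"]["metric_type"],
+            drop_previous_collection=db_cfg["drop_previous_collection"],
+        )
+    raise ValueError(f"Unsupported vector database type: {db_cfg['type']}")
+```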
diff --git a/src/vectordb/__init__.py b/src/vectordb/__init__.py
new file mode 100644
index 0000000..c0d0c4e
--- /dev/null
+++ b/src/vectordb/__init__.py
@@ -0,0 +1,2 @@
+# vectordb/__init__.py
+# Empty file, just makes Python treat this as a package
diff --git a/src/vectordb/chroma_api.py b/src/vectordb/chroma_api.py
new file mode 100644
index 0000000..ea1facc
--- /dev/null
+++ b/src/vectordb/chroma_api.py
@@ -0,0 +1,320 @@
+# adapted from lancedb_api
+import sys, os
+import random
+from tqdm import tqdm
+import concurrent.futures
+
+# put the repo root on sys.path; the reverse moves the appended entry to the front
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.reverse()
+# from monitor import MetricMonitorProcess
+from vectordb.DBInstance import DBInstance
+
+# chroma_api specific
+import chromadb
+from concurrent.futures import ThreadPoolExecutor
+
+
+class chroma_client(DBInstance):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.id_num = 0
+
+    def setup(self):
+        self.client = chromadb.PersistentClient(path=self.db_path)
+        print(f"***Connected to Chroma client at {self.db_path}\n")
+
+    def has_collection(self, collection_name):
+        collections = self.client.list_collections()
+        collection_names = [c.name for c in collections]
+
+        if collection_name in collection_names:
+            print(f"***Collection: {collection_name} exists.")
+            return True
+        else:
+            print(f"***Collection: {collection_name} does not exist.")
+            return False
+
+    def create_collection(self, collection_name, dim, consistency_level="Eventually", auto_id=True):
+        if self.has_collection(collection_name=collection_name):
+            print(f"***Collection: {collection_name} already exists.")
+            return
+        else:
+            try:
+                self.client.create_collection(name=collection_name)
+                print(f"***Created new collection: {collection_name}")
+                return
+            except Exception as e:
+                print(f"***Failed to create collection: {collection_name}. Error: {e}")
Error: {e}") + return + + def drop_collection(self, collection_name): + self.client.delete_collection(name=collection_name) + print(f"***Dropped existing collection: {collection_name}") + + def insert_data_vector( + self, + vector, + chunks, + collection_name=None, + insert_batch_size=1, + strict_check=False, + create_collection=False, + ): + if len(vector) != len(chunks): + # raise ValueError(f"Vectors length {len(vector)} != Chunks length {len(chunks)}") + # make the number to the smaller one + min_len = min(len(vector), len(chunks)) + vector = vector[:min_len] + chunks = chunks[:min_len] + + if self.has_collection(collection_name=collection_name) is False: + self.create_collection(collection_name=collection_name, dim=len(vector[0])) + else: + self.drop_collection(collection_name=collection_name) + self.create_collection(collection_name=collection_name, dim=len(vector[0])) + + collection = self.client.get_collection(name=collection_name) + + # Build list of points, one per record + id_list = [] + embeddings_list = [] + documents_list = [] + for v, c in zip(vector, chunks): + id_list.append(str(self.id_num)) + embeddings_list.append(v) + documents_list.append(c) + self.id_num += 1 + + # print(f"***Start insert: {len(point_list)}") + + for i in tqdm(range(0, int(len(id_list)), insert_batch_size)): + collection.add( + ids=id_list[i : i + insert_batch_size], + embeddings=embeddings_list[i : i + insert_batch_size], + documents=documents_list[i : i + insert_batch_size], + ) + print(f"***Insert done.") + + # return result + + def insert_data( + self, dict_list, collection_name=None, insert_batch_size=1, create_collection=False + ): + total_chunks_num = len(dict_list) + print(f"***Start insert: {total_chunks_num}") + + collection = self.client.get_collection(name=collection_name) + + id_list = [] + embeddings_list = [] + documents_list = [] + for d in dict_list: + id_list.append(str(self.id_num)) + embeddings_list.append(d["vector"]) + documents_list.append(d["text"]) + self.id_num += 1 + + batch_size = 1000 + for i in tqdm(range(0, len(id_list), batch_size)): + collection.add( + ids=id_list[i : i + batch_size], + embeddings=embeddings_list[i : i + batch_size], + documents=documents_list[i : i + batch_size], + ) + + print(f"***Insert done.") + + # def show_table(self, collection_name=None): + # tbl = self.client.open_table(collection_name) + # print(tbl.to_pandas()) + + def query_search( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + monitor=False, + ): + print(f"***Start query search in collection: {collection_name}") + + total_queries = len(query_vector) + + # Adjust search_batch_size if it exceeds total_queries + if search_batch_size > total_queries: + search_batch_size = total_queries + + results = [None] * total_queries + + num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + collection = self.client.get_collection(name=collection_name) + + def search_thread(start_idx, end_idx): + b_vectors = query_vector[start_idx:end_idx] + + if len(b_vectors) > 0: + mres = collection.query( + query_embeddings=query_vector[start_idx:end_idx], n_results=topk + ) + results[start_idx:end_idx] = mres["documents"] + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i 
+                # Iterate over batches, not individual queries, so start_idx stays in range.
+                start_idx = i * search_batch_size
+                end_idx = min(start_idx + search_batch_size, total_queries)
+                search_thread(start_idx, end_idx)
+        else:
+            with ThreadPoolExecutor(max_workers=max_threads) as executor:
+                futures = []
+                progress = tqdm(total=num_batches, desc="Searching batches")
+
+                def callback(future):
+                    progress.update(1)
+
+                for i in range(num_batches):
+                    start_idx = i * search_batch_size
+                    end_idx = min(start_idx + search_batch_size, total_queries)
+                    future = executor.submit(search_thread, start_idx, end_idx)
+                    future.add_done_callback(callback)
+                    futures.append(future)
+
+                concurrent.futures.wait(futures)
+                progress.close()
+
+        context_format = """Source #{source_idx}\nDetail: {source_detail}\n"""
+        contexts_results = []
+        with open("query.out", "w") as fout:
+            for query_idx, query_results in enumerate(results):
+                fout.write(f"=== Query #{query_idx + 1} Results ===\n")
+                context = []
+
+                for entry_idx, result in enumerate(query_results):
+                    text = result
+                    formatted = context_format.format(source_idx=entry_idx, source_detail=text)
+                    context.append(formatted)
+                    fout.write(f"*** Retrieved result #{entry_idx}, doc length: {len(text)}\n")
+                fout.write("\n")
+                contexts_results.append(context)
+
+        print("***Query search completed.")
+        return contexts_results
+
+    def build_index(
+        self,
+        collection_name,
+        index_type,
+        metric_type,
+        num_partitions=256,
+        num_sub_vectors=96,
+        idx_name=None,
+        drop_index=True,
+        device=None,
+        index_cache_size=None,
+    ):
+        # Chroma maintains its HNSW index automatically as data is inserted, so there
+        # is nothing to build explicitly here.
+        print("Building index with parameters:")
+        print(f"  index_type: {index_type}")
+        return
+
+
+# test
+if __name__ == "__main__":
+    print("Chroma client test")
+    # point db_path at a local Chroma directory
+    chroma = chroma_client(
+        db_path="/mnt/data1/shaobol2/chroma",
+        collection_name="test_collection",
+        dim=768,
+        index_type="IVF_PQ",
+        metric_type="L2",
+    )
+
+    chroma.setup()
+    chroma.create_collection("test_collection", dim=768)
+    # test insertion
+    dict_list = []
+    for i in range(10000):
+        dict_list.append(
+            {
+                "text": f"Sample text {i}",
+                "vector": [random.random() for _ in range(768)],  # Example vector of size 768
+            }
+        )
+    chroma.insert_data(
+        dict_list, collection_name="test_collection", insert_batch_size=10, create_collection=False
+    )
+    #
lc.show_table("test_collection") + # qc.build_index( + # "test_collection", + # index_type="IVF_HNSW_PQ", + # metric_type="L2", + # num_partitions=256, + # num_sub_vectors=96, + # drop_index=True, + # device=None, + # index_cache_size=None, + # ) + results = chroma.query_search( + query_vector=[[random.random() for _ in range(768)] for _ in range(2)], + topk=2, + collection_name="test_collection", + search_batch_size=2, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + monitor=True, + ) + print("Query results:") + print(results) diff --git a/src/vectordb/elastic_api.py b/src/vectordb/elastic_api.py new file mode 100644 index 0000000..f8d6e1a --- /dev/null +++ b/src/vectordb/elastic_api.py @@ -0,0 +1,304 @@ +# took from lancedb_api +import argparse +import sys, os +import random +from tqdm import tqdm +import re +import concurrent.futures +import lancedb + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.reverse() +from vectordb.DBInstance import DBInstance + +# elastic_api specific +from elasticsearch import Elasticsearch +from elasticsearch.helpers import bulk, BulkIndexError + +# local development installation in Docker: +# curl -fsSL https://elastic.co/start-local | sh + + +class elastic_client(DBInstance): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def setup(self): + self.client = Elasticsearch(self.db_path, basic_auth=("elastic", "3C8zBzzx")) + print(f"***Connected to Elasticsearch client at {self.db_path}\n") + + def has_collection(self, collection_name): + if self.client.indices.exists(index=collection_name.lower()): + print(f"***Collection: {collection_name} exists.") + return True + else: + print(f"***Collection: {collection_name} does not exist.") + return False + + def create_collection(self, collection_name, dim, consistency_level="Eventually", auto_id=True): + if self.client.indices.exists(index=collection_name.lower()): + self.client.indices.delete(index=collection_name.lower()) + # print(f"***Collection: {collection_name} already exists.") + # return + # else: + try: + mapping = { + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": dim, + "index": True, + "similarity": "cosine", + } + } + } + } + + b = self.client.indices.create(index=collection_name.lower(), body=mapping) + print(f"***Created new collection: {collection_name}") + return + except Exception as e: + print(f"***Failed to create collection: {collection_name}. 
Error: {e}") + return + + def drop_collection(self, collection_name): + self.client.indices.delete(index=collection_name.lower()) + print(f"***Dropped existing collection: {collection_name}") + + def insert_data_vector( + self, + vector, + chunks, + collection_name=None, + insert_batch_size=1, + strict_check=False, + create_collection=False, + ): + if len(vector) != len(chunks): + raise ValueError(f"Vectors length {len(vector)} != Chunks length {len(chunks)}") + + # Build list of points, one per record + for i in tqdm(range(0, int(len(vector)), insert_batch_size)): + dict_list = [] + for v, c in zip(vector[i : i + insert_batch_size], chunks[i : i + insert_batch_size]): + record = {"_index": collection_name.lower(), "_source": {"text": c, "embedding": v}} + dict_list.append(record) + + # print(f"***Start insert batch: {len(dict_list)}") + bulk(self.client, dict_list) + print(f"***Insert batch done.") + + def insert_data( + self, dict_list, collection_name=None, insert_batch_size=1, create_collection=False + ): + total_chunks_num = len(dict_list) + + new_dict_list = [] + for d in dict_list: + record = { + "_index": collection_name.lower(), + "_source": {"text": d["text"], "embedding": d["vector"]}, + } + new_dict_list.append(record) + + print(f"***Start insert: {total_chunks_num}") + + try: + bulk(self.client, new_dict_list) + except BulkIndexError as e: + for error in e.errors: + print(error) + + print(f"***Insert done.") + + # def show_table(self, collection_name=None): + # tbl = self.client.open_table(collection_name) + # print(tbl.to_pandas()) + + def query_search( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + monitor=False, + ): + print(f"***Start query search in collection: {collection_name}") + + total_queries = len(query_vector) + + # Adjust search_batch_size if it exceeds total_queries + if search_batch_size > total_queries: + search_batch_size = total_queries + + results = [None] * total_queries + + num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + # def search_thread(start_idx, end_idx): + # b_vectors = query_vector[start_idx:end_idx] + + # # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).nprobes(3).to_list() + # b_results = self.client.query_points(collection_name=collection_name, query=b_vectors, with_payload=False, limit=topk).points + + # results[start_idx:end_idx] = b_results + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i in tqdm(range(num_batches), desc="Searching batches"): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + + b_vectors = [] + for vec in range(start_idx, end_idx): + b_vectors.append({}) + b_vectors.append( + { + "knn": { + "field": "embedding", + "query_vector": query_vector[vec], + "k": topk, + "num_candidates": 100, + } + } + ) + mres = self.client.msearch(index=collection_name.lower(), searches=b_vectors) + responses = mres["responses"] + results[start_idx:end_idx] = responses + # if start_idx == 0: + # print(results[start_idx]["hits"]["hits"]) + # results[i] = self.client.query_points(collection_name=collection_name, query=query_vector[i], limit=topk) + else: + # with 
concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + # futures = [] + # progress = tqdm(total=num_batches, desc="Searching batches") + + # def callback(future): + # progress.update(1) + + # for i in range(num_batches): + # start_idx = i * search_batch_size + # end_idx = min(start_idx + search_batch_size, total_queries)d + # future = executor.submit(search_thread, start_idx, end_idx) + # future.add_done_callback(callback) + # futures.append(future) + + # concurrent.futures.wait(futures) + # progress.close() + print(("DEFAULT MULTITHREADING")) + + # print(results) + + # end_time = time.time() + context_format = """Source #{source_idx}\nDetail: {source_detail}\n""" + contexts_results = [] + with open("query.out", "w") as fout: + for query_idx, query_results in enumerate(results): + fout.write(f"=== Query #{query_idx + 1} Results ===\n") + context = [] + + for entry_idx, result in enumerate(query_results["hits"]["hits"]): + text = result["_source"]["text"] + formatted = context_format.format(source_idx=entry_idx, source_detail=text) + context.append(formatted) + fout.write(f"*** Retrieved result #{entry_idx}, doc length: {len(text)}\n") + fout.write("\n") + contexts_results.append(context) + + print(f"***Query search completed.") + return contexts_results + + def build_index( + self, + collection_name, + index_type, + metric_type, + num_partitions=256, + num_sub_vectors=96, + idx_name=None, + drop_index=True, + device=None, + index_cache_size=None, + ): + # Color print the index metrics used + print(f"Building index with parameters:", "cyan") + print(f" index_type: {index_type}", "green") + + # tbl = self.client.open_table(collection_name) + # tbl.create_index( + # metric=metric_type, + # num_partitions=num_partitions, + # num_sub_vectors=num_sub_vectors, + # vector_column_name='vector', + # replace=drop_index, + # accelerator=device, + # index_cache_size=32, + # index_type=index_type, + # num_bits=8, + # max_iterations=50, + # sample_rate=256, + # m=20, + # ef_construction=300, + # ) + + return + + +# test +if __name__ == "__main__": + print("Elastic client test") + # change qdrant path to a local on + elastic = elastic_client( + db_path="http://localhost:9200", + collection_name="test_collection", + dim=768, + index_type="IVF_PQ", + metric_type="L2", + ) + + elastic.setup() + # lc.drop_collection("test_collection") + elastic.create_collection("test_collection", dim=768) + # test insertion + dict_list = [] + for i in range(10000): + dict_list.append( + { + "text": f"Sample text {i}", + "vector": [random.random() for _ in range(768)], # Example vector of size 768 + } + ) + elastic.insert_data( + dict_list, collection_name="test_collection", insert_batch_size=10, create_collection=False + ) + # lc.show_table("test_collection") + # qc.build_index( + # "test_collection", + # index_type="IVF_HNSW_PQ", + # metric_type="L2", + # num_partitions=256, + # num_sub_vectors=96, + # drop_index=True, + # device=None, + # index_cache_size=None, + # ) + results = elastic.query_search( + query_vector=[[random.random() for _ in range(768)] for _ in range(2)], + topk=2, + collection_name="test_collection", + search_batch_size=2, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + monitor=True, + ) + print("Query results:") + print(results) diff --git a/src/vectordb/lancedb_api.py b/src/vectordb/lancedb_api.py new file mode 100644 index 0000000..88d2104 --- /dev/null +++ b/src/vectordb/lancedb_api.py @@ -0,0 +1,375 @@ +import 
argparse +import sys, os +import random +from tqdm import tqdm +import re +import concurrent.futures +import lancedb +import pyarrow as pa + + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.reverse() +from vectordb.DBInstance import DBInstance + + +class lance_client(DBInstance): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.type = "lancedb" + + def setup(self): + self.client = lancedb.connect(self.db_path) + print(f"***Connected to Lancedb client at {self.db_path}\n") + + def has_collection(self, collection_name): + print( + f"lancedb may not support has_collection, please check the collection exists by list_collections" + ) + return True + + def create_collection( + self, collection_name, dim, consistency_level="Eventually", auto_id=True, data_type="text" + ): + try: + if self.drop_collection(collection_name): + print(f"***Dropped existing collection: {collection_name}") + except Exception as e: + print(f"***No existing collection to drop: {collection_name}. Error: {e}") + pass + + if data_type == "image": + schema = pa.schema( + [ + pa.field("vector", pa.list_(pa.float32(), dim)), + pa.field("seq_id", pa.int32()), + pa.field("doc_id", pa.int32()), + pa.field("filepath", pa.string()), + ] + ) + elif data_type == "text": + schema = pa.schema( + [pa.field("text", pa.string()), pa.field("vector", pa.list_(pa.float32(), dim))] + ) + try: + self.client.create_table( + collection_name, + data=None, + schema=schema, + mode='create', + exist_ok=False, + on_bad_vectors='error', + fill_value=0, + ) + print(f"***Created new collection: {collection_name}") + return + except Exception as e: + print(f"***Failed to create collection: {collection_name}. Error: {e}") + return + + def drop_collection(self, collection_name): + self.client.drop_table(collection_name) + print(f"***Dropped existing collection: {collection_name}") + + def insert_data_vector( + self, + vector, + chunks, + collection_name=None, + insert_batch_size=1, + strict_check=False, + create_collection=False, + ): + if len(vector) != len(chunks): + raise ValueError(f"Vectors length {len(vector)} != Chunks length {len(chunks)}") + + # Build list of dicts, one per record + dict_list = [] + for v, c in zip(vector, chunks): + record = {"vector": v, "text": c} + dict_list.append(record) + + print(f"***Start insert: {len(dict_list)}") + tbl = self.client.open_table(collection_name) + result = tbl.add(dict_list, mode="append", on_bad_vectors="error") + print(f"***Insert done.") + return result + + def insert_data( + self, dict_list, collection_name=None, insert_batch_size=1, create_collection=False + ): + total_chunks_num = len(dict_list) + print(f"***Start insert: {total_chunks_num}") + tbl = self.client.open_table(collection_name) + result = tbl.add(dict_list, mode='append', on_bad_vectors='error') + print(f"***Insert done.") + + def show_table(self, collection_name=None): + tbl = self.client.open_table(collection_name) + print(tbl.to_pandas()) + + def query_search( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + ): + print(f"***Start query search in collection: {collection_name}") + + tbl = self.client.open_table(collection_name) + + total_queries = len(query_vector) + + # Adjust search_batch_size if it exceeds total_queries + if search_batch_size > total_queries: + search_batch_size = total_queries + + results = [None] * total_queries + + 
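# Stand-alone sketch of the LanceDB flow this wrapper builds on: connect,
# create a table from an explicit Arrow schema, append rows. The path, table
# name, and 8-dim vectors are illustrative.
import lancedb
import pyarrow as pa

db = lancedb.connect("/tmp/lancedb_demo")
schema = pa.schema(
    [pa.field("text", pa.string()), pa.field("vector", pa.list_(pa.float32(), 8))]
)
tbl = db.create_table("demo", schema=schema, mode="overwrite")
tbl.add([{"text": f"doc {i}", "vector": [float(i)] * 8} for i in range(16)])
print(tbl.count_rows())  # 16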
num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + def search_thread(start_idx, end_idx): + b_vectors = query_vector[start_idx:end_idx] + + # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).nprobes(3).to_list() + b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).to_list() + + results[start_idx:end_idx] = b_results + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i in tqdm(range(num_batches), desc="Searching batches"): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + b_vectors = query_vector[start_idx:end_idx] + b_results = ( + tbl.search(b_vectors, vector_column_name='vector') + # .distance_type("l2") + .limit(topk) + # .nprobes(1) + .to_list() + ) + # tbl.search(np.random.random((1536))).distance_type("cosine").limit(10).to_list() + # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).to_list() + if len(b_results) != search_batch_size * topk: + raise ValueError( + f"len(b_results) must be n*topk n = {search_batch_size}, topk {topk}, but got {len(b_results)}" + ) + b_results = [b_results[i * topk : (i + 1) * topk] for i in range(search_batch_size)] + results[start_idx:end_idx] = b_results + else: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + progress = tqdm(total=num_batches, desc="Searching batches") + + def callback(future): + progress.update(1) + + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + future = executor.submit(search_thread, start_idx, end_idx) + future.add_done_callback(callback) + futures.append(future) + + concurrent.futures.wait(futures) + progress.close() + + # end_time = time.time() + context_format = """Source #{source_idx}\nDetail: {source_detail}\n""" + contexts_results = [] + with open("query.out", "w") as fout: + for query_idx, query_results in enumerate(results): + fout.write(f"=== Query #{query_idx + 1} Results ===\n") + context = [] + for entry_idx, result in enumerate(query_results): + text = result["text"] + formatted = context_format.format(source_idx=entry_idx, source_detail=text) + context.append(formatted) + fout.write(f"*** Retrieved result #{entry_idx}, doc length: {len(text)}\n") + fout.write("\n") + contexts_results.append(context) + + print(f"***Query search completed.") + return contexts_results + + def query_search_image( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + ): + print(f"***Start query search in collection: {collection_name}") + + tbl = self.client.open_table(collection_name) + + total_queries = len(query_vector) + + # Adjust search_batch_size if it exceeds total_queries + if search_batch_size > total_queries: + search_batch_size = total_queries + + results = [None] * total_queries + + num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + def search_thread(start_idx, end_idx): + b_vectors = query_vector[start_idx:end_idx] + + # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).nprobes(3).to_list() + b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).to_list() + + 
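# The n*topk check above exists because searching a batch of vectors returns
# one flat row list; this isolated helper shows the reshape, keyed on the
# actual number of query vectors so the final partial batch also splits
# correctly (the inline version keys on search_batch_size).
def split_flat_results(flat_rows, n_queries, topk):
    if len(flat_rows) != n_queries * topk:
        raise ValueError(f"expected {n_queries * topk} rows, got {len(flat_rows)}")
    return [flat_rows[q * topk : (q + 1) * topk] for q in range(n_queries)]

print(split_flat_results(list(range(6)), n_queries=2, topk=3))  # [[0, 1, 2], [3, 4, 5]]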
results[start_idx:end_idx] = b_results + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i in tqdm(range(num_batches), desc="Searching batches"): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + b_vectors = query_vector[start_idx:end_idx] + b_results = ( + tbl.search(b_vectors, vector_column_name='vector') + .limit(topk) + .nprobes(1) + .to_list() + ) + # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).to_list() + if len(b_results) != len(b_vectors) * topk: + raise ValueError( + f"len(b_results) must be n*topk n = {search_batch_size}, topk {topk}, but got {len(b_results)}" + ) + b_results = [b_results[i * topk : (i + 1) * topk] for i in range(search_batch_size)] + results[start_idx:end_idx] = b_results + else: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + progress = tqdm(total=num_batches, desc="Searching batches") + + def callback(future): + progress.update(1) + + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + future = executor.submit(search_thread, start_idx, end_idx) + future.add_done_callback(callback) + futures.append(future) + + concurrent.futures.wait(futures) + progress.close() + + # end_time = time.time() + doc_ids = set() + with open("query.out", "w") as fout: + for query_results in results: + for result in query_results: + doc_ids.add(result["doc_id"]) + + print(f"***Query search completed.") + return doc_ids + + def query(self, collection_name, filter_expr, output_fields=None, limit=10): + tbl = self.client.open_table(collection_name) + if output_fields is not None: + query = tbl.search().where(filter_expr).select(output_fields).to_pandas() + else: + query = tbl.search().where(filter_expr).to_pandas() + return query + + def build_index( + self, + collection_name, + index_type, + metric_type, + num_partitions=256, + num_sub_vectors=96, + idx_name=None, + drop_index=True, + device=None, + index_cache_size=None, + ): + # Color print the index metrics used + print(f"Building index with parameters:", "cyan") + print(f" index_type: {index_type}", "green") + + tbl = self.client.open_table(collection_name) + tbl.create_index( + metric=metric_type, + num_partitions=num_partitions, + num_sub_vectors=num_sub_vectors, + vector_column_name='vector', + replace=drop_index, + accelerator=device, + index_cache_size=32, + index_type=index_type, + num_bits=8, + max_iterations=50, + sample_rate=256, + m=20, + ef_construction=300, + ) + + return + + +# test +if __name__ == "__main__": + print("Lance client test") + # change lance path to a local on + lc = lance_client( + db_path="/mnt/nvme1n1/shaobol2/ragdata/lancedb", + collection_name="test_collection", + dim=768, + index_type="IVF_PQ", + metric_type="L2", + ) + + lc.setup() + # lc.drop_collection("test_collection") + # lc.create_collection("test_collection", dim=768) + # test insertion + # dict_list = [] + # for i in range(10000000): + # dict_list.append({ + # "text": f"Sample text {i}", + # "vector": [random.random() for _ in range(768)] # Example vector of size 768 + # }) + # lc.insert_data(dict_list, collection_name="test_collection", insert_batch_size=10, create_collection=False) + # lc.show_table("test_collection") + lc.build_index( + 
"test_collection", + index_type="IVF_HNSW_PQ", + metric_type="L2", + num_partitions=256, + num_sub_vectors=96, + drop_index=True, + device=None, + index_cache_size=None, + ) + # results = lc.query_search( + # query_vector=[[random.random() for _ in range(768)] for _ in range(2)], + # topk=2, + # collection_name="test_collection", + # search_batch_size=2, + # multithread=False, + # max_threads=4, + # consistency_level="Eventually", + # output_fields=["text", "vector"], + # monitor=True + # ) + # print("Query results:") + # print(results) diff --git a/src/vectordb/lancedb_interactive.py b/src/vectordb/lancedb_interactive.py new file mode 100644 index 0000000..d7cb66e --- /dev/null +++ b/src/vectordb/lancedb_interactive.py @@ -0,0 +1,133 @@ +import os, sys + +script_path = os.path.realpath(__file__) +script_dir = os.path.dirname(script_path) +script_name = os.path.basename(script_path) + +import utils.python_utils as utils +import pprint +import argparse +import inspect + +from pymilvus import MilvusClient + +db_dir = "/mnt/nvme2n1/rag_bench/db" +db_name = "milvus.db" +history_file_dir = script_dir + + +def construct_db_filepath(dir, name): + if dir is None: + dir = db_dir + if name is None: + name = db_name + return os.path.join(dir, name) + + +parser = argparse.ArgumentParser( + prog=script_name, description="Interactive Milvus client for inspect db status" +) +parser.add_argument("-p", "--path") + +saved_local_names = set() +saved_local_names = set(locals().keys()) +# define all helper functions BELOW this line + + +def help(): + """Print predefined helper methods and their descriptions""" + max_local_def_strlen = max((len(local_def) for local_def in local_defs), default=0) + for local_def in local_defs: + func_obj = saved_locals[local_def] + docstring = "" if func_obj.__doc__ is None else func_obj.__doc__ + docstring = os.linesep.join( + [line for line in docstring.splitlines() if len(line.strip()) > 0] + ) + signature = inspect.signature(func_obj) + print(f"{local_def}{signature}: {docstring}") + + +def ls(): + """List all collection and #rows contained""" + collections = mc.list_collections() + max_collection_strlen = max((len(collection) for collection in collections), default=0) + print(f"total {len(collections)}") + for collection in collections: + stats = mc.get_collection_stats(collection) + print(f"""{collection:{max_collection_strlen}s} {stats["row_count"]}""") + + +def stat(name: str): + """ + Get collection properties + + @param name: name of the collection + """ + if name not in mc.list_collections(): + print(f"\"{name}\" is not a valid collection name") + return + print(f"Property:") + pprint.pprint(mc.describe_collection(name)) + + +def reload_db(dir=None, name=None): + """ + Load db from file + + @param dir: directory of db + @param name: name of db + """ + # db_path = construct_db_filepath(dir, name) + # assert os.path.isfile(db_path), f"\"{db_path}\" is not a valid db file" + + # mc = MilvusClient(db_path) + # print(f"Using MilvusClient with db \"{db_path}\"") + mc = MilvusClient(uri="http://localhost:19530", token="root:Milvus") + return mc + + +# define all helper functions ABOVE this line +local_defs = set(locals().keys()) - saved_local_names +saved_locals = locals() + +# load/initialize db +mc = reload_db() + +title = "MilvusClient is in the variable named \"mc\", entering interactive mode" +exitmsg = "Exiting interactive mode" +try: + from ptpython.repl import embed +except ImportError: + history_file = os.path.join(os.path.join(history_file_dir, ".python_history")) 
+ + # start interactive shell + import code + import readline + import rlcompleter + + sys.ps1 = "(mc) >>> " + sys.ps2 = "(mc) ... " + vars = globals() | locals() + readline.set_completer(rlcompleter.Completer(vars).complete) + readline.parse_and_bind("tab: complete") + + try: + readline.read_history_file(history_file) + except Exception: + pass + + try: + code.InteractiveConsole(vars).interact(banner=title, exitmsg=exitmsg) + finally: + readline.write_history_file(history_file) +else: + history_file = os.path.join(os.path.join(history_file_dir, ".ptpython_history")) + + def configure(repl): + repl.vi_mode = True + repl.enable_history_search = True + repl.enable_auto_suggest = True + repl.confirm_exit = False + + embed(globals(), locals(), configure=configure, title=title, history_filename=history_file) + print(exitmsg) diff --git a/src/vectordb/milvus_api.py b/src/vectordb/milvus_api.py new file mode 100644 index 0000000..c31c1db --- /dev/null +++ b/src/vectordb/milvus_api.py @@ -0,0 +1,412 @@ +from pymilvus import MilvusClient +from tqdm import tqdm +import re +import concurrent.futures + +# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +# sys.path.reverse() +from vectordb.DBInstance import DBInstance + + +class milvus_client(DBInstance): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.type = "milvus" + self.db_token = kwargs.get("db_token", "root:Milvus") + + def setup(self): + self.client = MilvusClient(uri=self.db_path, token=self.db_token) + print(f"***Connected to Milvus client at {self.db_path}\n") + # return self.client + + def has_collection(self, collection_name): + if self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} exists.") + return True + else: + print(f"***Collection: {collection_name} does not exist.") + return False + + def create_collection(self, collection_name, dim, consistency_level="Eventually", auto_id=True): + if self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} already exists.") + # load collection + return self.client.load_collection(collection_name) + else: + try: + self.client.create_collection( + collection_name, dim, consistency_level="Eventually", auto_id=True + ) + print( + f"***Created new collection: {collection_name} with consistency_level: {consistency_level}" + ) + return + except Exception as e: + print(f"***Failed to create collection: {collection_name}. Error: {e}") + return + + def drop_collection(self, collection_name): + if not self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} does not exist.") + return + self.client.drop_collection(collection_name) + print(f"***Dropped existing collection: {collection_name}") + + def insert_data_vector( + self, + vector, + chunks, + collection_name=None, + insert_batch_size=1, + strict_check=False, + create_collection=False, + ): + if collection_name is None: + collection_name = self.default_collection + if not self.client.has_collection(collection_name): + if create_collection: + # create_collection first + self.create_collection(collection_name, dim=len(vector[0])) + else: + print(f"***Collection: {collection_name} does not exist. Please create it first.") + return + + total_chunks_num = len(chunks) + total_vectors_num = len(vector) + if total_chunks_num != total_vectors_num and strict_check: + print( + f"***Error: The number of chunks ({total_chunks_num}) does not match the number of vectors ({total_vectors_num})." 
+ ) + return + print(f"***Start insert: {total_chunks_num}") + + for i in tqdm(range(0, total_chunks_num, insert_batch_size), desc="inserting"): + dict_list = [ + {"text": text, "vector": vector} + for text, vector in zip( + chunks[i : i + insert_batch_size], vector[i : i + insert_batch_size] + ) + ] + self.client.insert(collection_name, data=dict_list, progress_bar=False) + + print(f"***Insert done.") + + def insert_data( + self, dict_list, collection_name=None, insert_batch_size=1, create_collection=False + ): + if collection_name is None: + collection_name = self.default_collection + if not self.client.has_collection(collection_name): + if create_collection: + # create_collection first + self.create_collection(collection_name, dim=len(dict_list[0]["vector"])) + else: + print(f"***Collection: {collection_name} does not exist. Please create it first.") + return + + total_chunks_num = len(dict_list) + print(f"***Start insert: {total_chunks_num}") + + for i in tqdm(range(0, total_chunks_num, insert_batch_size), desc="inserting"): + self.client.insert( + collection_name, data=dict_list[i : i + insert_batch_size], progress_bar=False + ) + + print(f"***Insert done.") + + def query_search( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=1, + consistency_level="Eventually", + output_fields=["text", "vector"], + ): + if collection_name is None: + collection_name = self.default_collection + if not self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} does not exist. Please create it first.") + return + + self.client.load_collection(collection_name) + + total_queries = len(query_vector) + results = [None] * total_queries + + num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + def search_thread(start_idx, end_idx): + b_vectors = query_vector[start_idx:end_idx] + b_results = self.client.search( + collection_name, + data=b_vectors, + limit=topk, + consistency_level="Eventually", + output_fields=output_fields, + ) + results[start_idx:end_idx] = b_results + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + b_vectors = query_vector[start_idx:end_idx] + b_results = self.client.search( + collection_name, + data=b_vectors, + limit=topk, + consistency_level=consistency_level, + output_fields=output_fields, + ) + results[start_idx:end_idx] = b_results + else: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + progress = tqdm(total=num_batches, desc="Searching batches") + + def callback(future): + progress.update(1) + + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + future = executor.submit(search_thread, start_idx, end_idx) + future.add_done_callback(callback) + futures.append(future) + + concurrent.futures.wait(futures) + progress.close() + + context_format = """Source #{source_idx}\nDetail: {source_detail}\n""" + contexts_results = [] + with open("query.out", "w") as fout: + for query_idx, query_results in enumerate(results): + fout.write(f"=== Query #{query_idx + 1} Results ===\n") + context = [] + for entry_idx, result in enumerate(query_results): + 
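# Shape of the MilvusClient.search results consumed in the loop below: one
# list of hits per query vector, each hit dict-like with "id", "distance",
# and an "entity" dict holding the requested output_fields. URI and token are
# the local defaults used throughout this patch; the collection name and a
# loaded, indexed 768-dim collection are assumed.
import random
from pymilvus import MilvusClient

demo_client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
hits_per_query = demo_client.search(
    "test_collection",
    data=[[random.random() for _ in range(768)]],
    limit=3,
    output_fields=["text"],
)
for hits in hits_per_query:
    for hit in hits:
        print(hit["id"], round(hit["distance"], 4), len(hit["entity"].get("text", "")))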
entity = result.get("entity", {}) + detail = re.sub(r"\n+", "\n", entity.get("text", "")) + formatted = context_format.format(source_idx=entry_idx, source_detail=detail) + context.append(formatted) + fout.write( + f"*** Retrieved result #{entry_idx}, id: {result.get('id')}, distance: {result.get('distance'):.4f}, doc length: {len(detail)}\n" + ) + fout.write("\n") + contexts_results.append(context) + + # print(f"*** Milvus limited-thread search time: {round(end_time - start_time, 2)} seconds") + # self.client.release_collection(collection_name) + + # if True: + # self.client.alter_collection_properties( + # collection_name=collection_name, + # properties={ + # "mmap.enabled": True + # } + # ) + + return contexts_results + + def query_search_image( + self, + query_vector, + topk, + collection_name=None, + search_batch_size=1, + multithread=False, + max_threads=1, + consistency_level="Eventually", + output_fields=["text", "vector"], + ): + if collection_name is None: + collection_name = self.default_collection + if not self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} does not exist. Please create it first.") + return + + self.client.load_collection(collection_name) + + total_queries = len(query_vector) + results = [None] * total_queries + + num_batches = (total_queries + search_batch_size - 1) // search_batch_size + + def search_thread(start_idx, end_idx): + b_vectors = query_vector[start_idx:end_idx] + b_results = self.client.search( + collection_name, + data=b_vectors, + limit=topk, + consistency_level="Eventually", + output_fields=output_fields, + ) + results[start_idx:end_idx] = b_results + + # start_time = time.time() + # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}") + if max_threads == 1 or not multithread: + # Single-threaded search + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + b_vectors = query_vector[start_idx:end_idx] + b_results = self.client.search( + collection_name, + data=b_vectors, + limit=topk, + consistency_level=consistency_level, + output_fields=output_fields, + ) + results[start_idx:end_idx] = b_results + else: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [] + progress = tqdm(total=num_batches, desc="Searching batches") + + def callback(future): + progress.update(1) + + for i in range(num_batches): + start_idx = i * search_batch_size + end_idx = min(start_idx + search_batch_size, total_queries) + future = executor.submit(search_thread, start_idx, end_idx) + future.add_done_callback(callback) + futures.append(future) + + concurrent.futures.wait(futures) + progress.close() + print(f"len results: {len(results)}") + + # get unique doc_id from db search + doc_ids = set() + for r_id in range(len(results)): + for r in range(len(results[r_id])): + doc_ids.add(results[r_id][r]["entity"]["doc_id"]) + + return doc_ids + + def query(self, collection_name, filter_expr, output_fields=["text", "vector"], limit=10): + results = self.client.query( + collection_name=collection_name, + filter_expr=filter_expr, + output_fields=output_fields, + limit=limit, + ) + return results + + def build_index(self, collection_name, index_type, metric_type, idx_name=None, drop_index=True): + if collection_name is None: + collection_name = self.default_collection + if not self.client.has_collection(collection_name): + print(f"***Collection: {collection_name} 
does not exist. Please create it first.") + return + print(f"***Creating index: {index_type} metics: {metric_type}") + res = self.client.list_indexes(collection_name=collection_name) + + self.client.flush(collection_name=collection_name) + + if drop_index and len(res) > 0: + self.client.release_collection( + collection_name=collection_name, + ) + self.client.drop_index(collection_name=collection_name, index_name=res[0]) + print(f"*** Drop index name: {res[0]}") + + if idx_name is None: + idx_name = f"{index_type}_{metric_type}" + print(f"*** Index name to default: {idx_name}") + print(f"*** Create index name: {idx_name}") + + index_params = self.client.prepare_index_params() + + # 4.2. Add an index on the vector field. + if index_type == "IVF_PQ": + index_params.add_index( + field_name="vector", + metric_type=metric_type, + index_type=index_type, + index_name=idx_name, + params={ + "m": 128, # Number of sub-vectors to split eahc vector into + }, + ) + else: + index_params.add_index( + field_name="vector", + metric_type=metric_type, + index_type=index_type, + index_name=idx_name, + ) + + # 4.3. Create an index file + self.client.create_index(collection_name=collection_name, index_params=index_params) + + # self.client.flush(collection_name=self.collection_name) + + res = self.client.list_indexes(collection_name=collection_name) + + index_describe = self.client.describe_index( + collection_name=collection_name, index_name=res[0] + ) + # self.client.flush(collection_name=collection_name) + print(index_describe) + + +# test +# if __name__ == "__main__": +# client = MilvusClient(uri="http://localhost:19530", token="root:Milvus") +# idx_name= "test" +# collection_name = "wikimedia_wikipedia_all_MiniLM_L6_v2_0_1_512" +# client.release_collection(collection_name=collection_name) +# client.drop_index(collection_name=collection_name, index_name=idx_name) + +# index_params = client.prepare_index_params() + +# index_params.add_index( +# field_name="vector", +# metric_type="L2", +# index_type="GPU_IVF_FLAT", +# index_name=idx_name +# ) +# client.create_index( +# collection_name= collection_name, +# index_params=index_params +# ) +# print(f"***Created index: {idx_name} on collection: {collection_name}") + +# #test search +# # vectors = [[random.random() for i in range(512)] for j in range(10)] # Example vectors + +# # self.client.flush(collection_name=self.collection_name) + +# res = self.client.list_indexes( +# collection_name=collection_name +# ) + +# print("Milvus client test") +# mc = milvus_client( +# db_path="http://localhost:19530", +# collection_name="test_collection", +# dim=768, +# index_type="IVF_PQ", +# metric_type="L2" +# ) + +# mc.setup() +# mc.has_collection("test_collection") +# mc.create_collection("test_collection", dim=768) +# mc.drop_collection("test_collection") +# test insertion +# vectors = [[random.random() for i in range(768)] for j in range(10)] # Example vectors +# print(vectors) +# mc.insert_data(vectors, ["text1", "text2", "text3", "text4", "text5", "text6", "text7", "text8", "text9", "text10"], "test_collection") +# mc.build_index("test_collection", "IVF_PQ", "L2", drop_index=True) +# query_vector = [vectors[0],vectors[3]] # Example query vectors +# results = mc.query_search(query_vector, topk=2, collection_name="test_collection", search_batch_size=2, multithread=True, max_threads=4) +# print("Query results:", results) diff --git a/src/vectordb/milvus_interactive.py b/src/vectordb/milvus_interactive.py new file mode 100644 index 0000000..9198412 --- /dev/null +++ 
b/src/vectordb/milvus_interactive.py @@ -0,0 +1,265 @@ +import os, sys + +script_path = os.path.realpath(__file__) +script_dir = os.path.dirname(script_path) +script_name = os.path.basename(script_path) + +import utils.python_utils as utils +import pprint +import argparse +import inspect + +from pymilvus import MilvusClient + +db_dir = "/mnt/nvme1n1/rag_bench/db" +db_name = "milvus.db" +history_file_dir = script_dir + + +def construct_db_filepath(dir, name): + if dir is None: + dir = db_dir + if name is None: + name = db_name + return os.path.join(dir, name) + + +parser = argparse.ArgumentParser( + prog=script_name, description="Interactive Milvus client for inspect db status" +) +parser.add_argument("-p", "--path") + +saved_local_names = set() +saved_local_names = set(locals().keys()) +# define all helper functions BELOW this line + + +def help(): + """Print predefined helper methods and their descriptions""" + max_local_def_strlen = max((len(local_def) for local_def in local_defs), default=0) + for local_def in local_defs: + func_obj = saved_locals[local_def] + docstring = "" if func_obj.__doc__ is None else func_obj.__doc__ + docstring = os.linesep.join( + [line for line in docstring.splitlines() if len(line.strip()) > 0] + ) + signature = inspect.signature(func_obj) + print(f"{local_def}{signature}: {docstring}") + + +def ls(): + """List all collection and #rows contained""" + collections = mc.list_collections() + max_collection_strlen = max((len(collection) for collection in collections), default=0) + print(f"total {len(collections)}") + for collection in collections: + stats = mc.get_collection_stats(collection) + print(f"""{collection:{max_collection_strlen}s} {stats["row_count"]}""") + + +def stat(name: str): + """ + Get collection properties + + @param name: name of the collection + """ + if name not in mc.list_collections(): + print(f"\"{name}\" is not a valid collection name") + return + print(f"Property:") + pprint.pprint(mc.describe_collection(name)) + + +def reload_db(dir=None, name=None): + """ + Load db from file + + @param dir: directory of db + @param name: name of db + """ + # db_path = construct_db_filepath(dir, name) + # assert os.path.isfile(db_path), f"\"{db_path}\" is not a valid db file" + # mc = MilvusClient(db_path) + # print(f"Using MilvusClient with db \"{db_path}\"") + mc = MilvusClient(uri="http://localhost:19530", token="root:Milvus") + return mc + + +# define all helper functions ABOVE this line +local_defs = set(locals().keys()) - saved_local_names +saved_locals = locals() + +# load/initialize db +mc = reload_db() + +title = "MilvusClient is in the variable named \"mc\", entering interactive mode" +exitmsg = "Exiting interactive mode" +try: + from ptpython.repl import embed +except ImportError: + history_file = os.path.join(os.path.join(history_file_dir, ".python_history")) + + # start interactive shell + import code + import readline + import rlcompleter + + sys.ps1 = "(mc) >>> " + sys.ps2 = "(mc) ... 
" + vars = globals() | locals() + readline.set_completer(rlcompleter.Completer(vars).complete) + readline.parse_and_bind("tab: complete") + + try: + readline.read_history_file(history_file) + except Exception: + pass + + try: + code.InteractiveConsole(vars).interact(banner=title, exitmsg=exitmsg) + finally: + readline.write_history_file(history_file) +else: + history_file = os.path.join(os.path.join(history_file_dir, ".ptpython_history")) + + def configure(repl): + repl.vi_mode = True + repl.enable_history_search = True + repl.enable_auto_suggest = True + repl.confirm_exit = False + + embed(globals(), locals(), configure=configure, title=title, history_filename=history_file) + print(exitmsg) + +import os, sys + +script_path = os.path.realpath(__file__) +script_dir = os.path.dirname(script_path) +script_name = os.path.basename(script_path) + +import utils.python_utils as utils +import pprint +import argparse +import inspect + +from pymilvus import MilvusClient + +db_dir = "/mnt/nvme1n1/rag_bench/db" +db_name = "milvus.db" +history_file_dir = script_dir + + +def construct_db_filepath(dir, name): + if dir is None: + dir = db_dir + if name is None: + name = db_name + return os.path.join(dir, name) + + +parser = argparse.ArgumentParser( + prog=script_name, description="Interactive Milvus client for inspect db status" +) +parser.add_argument("-p", "--path") + +saved_local_names = set() +saved_local_names = set(locals().keys()) +# define all helper functions BELOW this line + + +def help(): + """Print predefined helper methods and their descriptions""" + max_local_def_strlen = max((len(local_def) for local_def in local_defs), default=0) + for local_def in local_defs: + func_obj = saved_locals[local_def] + docstring = "" if func_obj.__doc__ is None else func_obj.__doc__ + docstring = os.linesep.join( + [line for line in docstring.splitlines() if len(line.strip()) > 0] + ) + signature = inspect.signature(func_obj) + print(f"{local_def}{signature}: {docstring}") + + +def ls(): + """List all collection and #rows contained""" + collections = mc.list_collections() + max_collection_strlen = max((len(collection) for collection in collections), default=0) + print(f"total {len(collections)}") + for collection in collections: + stats = mc.get_collection_stats(collection) + print(f"""{collection:{max_collection_strlen}s} {stats["row_count"]}""") + + +def stat(name: str): + """ + Get collection properties + + @param name: name of the collection + """ + if name not in mc.list_collections(): + print(f"\"{name}\" is not a valid collection name") + return + print(f"Property:") + pprint.pprint(mc.describe_collection(name)) + + +def reload_db(dir=None, name=None): + """ + Load db from file + + @param dir: directory of db + @param name: name of db + """ + # db_path = construct_db_filepath(dir, name) + # assert os.path.isfile(db_path), f"\"{db_path}\" is not a valid db file" + # mc = MilvusClient(db_path) + # print(f"Using MilvusClient with db \"{db_path}\"") + mc = MilvusClient(uri="http://localhost:19530", token="root:Milvus") + return mc + + +# define all helper functions ABOVE this line +local_defs = set(locals().keys()) - saved_local_names +saved_locals = locals() + +# load/initialize db +mc = reload_db() + +title = "MilvusClient is in the variable named \"mc\", entering interactive mode" +exitmsg = "Exiting interactive mode" +try: + from ptpython.repl import embed +except ImportError: + history_file = os.path.join(os.path.join(history_file_dir, ".python_history")) + + # start interactive shell + import code + 
diff --git a/src/vectordb/milvus_util.py b/src/vectordb/milvus_util.py
new file mode 100644
index 0000000..0654d0e
--- /dev/null
+++ b/src/vectordb/milvus_util.py
@@ -0,0 +1,118 @@
+from pymilvus import MilvusClient
+import argparse
+
+
+class milvus_util:
+    def __init__(self):
+        self.client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
+        print("***Connected to Milvus client\n")
+
+    def load_collections(self, collection_name):
+        self.client.load_collection(collection_name)
+        print(f"***load collections {collection_name}.")
+
+    def release_collections(self, collection_name):
+        self.client.release_collection(collection_name)
+        print(f"***release collections {collection_name}.")
+
+    def release_all_collections(self):
+        collections = self.client.list_collections()
+
+        for collection_name in collections:
+            self.client.release_collection(collection_name)
+        print("***All collections released.")
+
+    def drop_collection(self, collection_name):
+        self.client.drop_collection(collection_name)
+        print(f"***Dropped existing collection: {collection_name}")
+
+    def create_collection(self, collection_name, dim):
+        if self.client.has_collection(collection_name):
+            print(f"***collection: {collection_name} already exists")
+            return
+        print(f"***Creating new collection: {collection_name} with consistency_level: Eventually")
+
+        self.client.create_collection(
+            collection_name, dim, consistency_level="Eventually", auto_id=True
+        )
+        return
+
+    def create_index(self, collection_name):
+        # NOTE: expects self.index_type and self.metric_type to be set by the caller
+        print(f"***Creating index: {self.index_type} metrics: {self.metric_type}")
+        res = self.client.list_indexes(collection_name=collection_name)
+
+        if len(res) > 0:
+            self.client.release_collection(
+                collection_name=collection_name,
+            )
+            self.client.drop_index(collection_name=collection_name, index_name=res[0])
+            print(f"*** Drop index name: {res[0]}")
+        print(f"*** Create index name: {self.index_type}_{self.metric_type}")
+
+        index_params = MilvusClient.prepare_index_params()
+
+        if self.index_type == "IVF_PQ":
+            index_params.add_index(
+                field_name="vector",
+                metric_type=self.metric_type,
+                index_type=self.index_type,
+                index_name=f"{self.index_type}_{self.metric_type}",
+                params={
+                    "m": 4,  # Number of sub-vectors to split each vector into
+                },
+            )
+        else:
+            index_params.add_index(
+                field_name="vector",
+                metric_type=self.metric_type,
+                index_type=self.index_type,
+                index_name=f"{self.index_type}_{self.metric_type}",
+            )
+
+        # 4.3.
Create an index file + self.client.create_index(collection_name=collection_name, index_params=index_params) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "--db", type=str, default="http://localhost:19530", help="db client location" + ) + parser.add_argument("--task", type=str, default="none", help="action to the db client") + parser.add_argument("--collection", type=str, default="none", help="collection name") + parser.add_argument("--dim", type=int, default=0, help="dimension of the collection") + parser.add_argument("--index_type", type=str, default="IVF_PQ", help="index type") + parser.add_argument( + "--config", type=str, default="none", help="load collection name from config" + ) + args = parser.parse_args() + + # if args.config != "none": + # config = load_config(args.config) + # if "dataset" in config: + # dataset_cfg.update(config["dataset"]) + # if "pipeline" in config: + # pipeline_cfg.update(config["pipeline"]) + # args.collection = dataset_cfg["collection_name"] + # args.index_type = dataset_cfg["index_type"] + + db_i = milvus_util() + # db_i.client = MilvusClient(uri=args.db, token="root:Milvus") + + switch = { + "load": db_i.load_collections, + "release": db_i.release_collections, + "release_all": db_i.release_all_collections, + "drop": db_i.drop_collection, + "create": db_i.create_collection, + # "create_index": db_i.create_index + } + print("dimension: ", args.dim) + if args.task == "create": + db_i.create_collection(args.collection, args.dim) + elif args.task in switch: + switch[args.task](args.collection) + else: + print(f"***{args.task} is not a valid task") + # db_i.client = MilvusClient(uri="http://localhost:195 diff --git a/src/vectordb/qdrant_api.py b/src/vectordb/qdrant_api.py new file mode 100644 index 0000000..676aa62 --- /dev/null +++ b/src/vectordb/qdrant_api.py @@ -0,0 +1,314 @@ +# took from lancedb_api +import argparse +import sys, os +import random +from tqdm import tqdm +import re +import concurrent.futures +import lancedb + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.reverse() +from vectordb.DBInstance import DBInstance + +# qdrant_api specific +from qdrant_client import QdrantClient, models + +# from qdrant_client.models import Distance, VectorParams + +## deployed form docker +# docker pull qdrant/qdrant +# docker run -p 6333:6333 -p 6334:6334 \ +# -v "${db_path}/qdrant_storage:/qdrant/storage:z" \ +# qdrant/qdrant + + +class qdrant_client(DBInstance): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.id_num = 0 + + def setup(self): + self.client = QdrantClient(url=self.db_path, timeout=200) + print(f"***Connected to Qdrant client at {self.db_path}\n") + + def has_collection(self, collection_name): + if self.client.collection_exists(collection_name=collection_name): + print(f"***Collection: {collection_name} exists.") + return True + else: + print(f"***Collection: {collection_name} does not exist.") + return False + + def create_collection(self, collection_name, dim, consistency_level="Eventually", auto_id=True): + if self.client.collection_exists(collection_name=collection_name): + print(f"***Collection: {collection_name} already exists.") + return + else: + try: + self.client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams(size=dim, distance=models.Distance.DOT), + ) + print(f"***Created new collection: {collection_name}") + return + except Exception as e: + print(f"***Failed to create collection: 
{collection_name}. Error: {e}")
+                return
+
+    def drop_collection(self, collection_name):
+        self.client.delete_collection(collection_name=collection_name)
+        print(f"***Dropped existing collection: {collection_name}")
+
+    def insert_data_vector(
+        self,
+        vector,
+        chunks,
+        collection_name=None,
+        insert_batch_size=32,
+        strict_check=False,
+        create_collection=False,
+    ):
+        if len(vector) != len(chunks):
+            raise ValueError(f"Vectors length {len(vector)} != Chunks length {len(chunks)}")
+
+        if not self.client.collection_exists(collection_name=collection_name):
+            self.create_collection(collection_name=collection_name, dim=len(vector[0]))
+
+        # Build list of points, one per record
+        total_count = min(len(vector), len(chunks))
+        # pbar = tqdm(total=total_count, desc="Inserting batches")
+        for i in tqdm(range(0, total_count, insert_batch_size), desc="Inserting batches"):
+            point_list = []
+            end_idx = min(i + insert_batch_size, total_count)
+            for v, c in zip(vector[i:end_idx], chunks[i:end_idx]):
+                record = models.PointStruct(id=self.id_num, vector=v, payload={"chunk": c})
+                self.id_num += 1
+                point_list.append(record)
+
+            # print(f"***Start insert: {len(point_list)}")
+            operation_info = self.client.upsert(
+                collection_name=collection_name,
+                wait=True,
+                points=point_list,
+            )
+            # pbar.update(len(point_list))
+        print("***Insert done.")
+        # return result
+
+    def insert_data(
+        self, dict_list, collection_name=None, insert_batch_size=1, create_collection=False
+    ):
+        total_chunks_num = len(dict_list)
+        print(f"***Start insert: {total_chunks_num}")
+
+        point_list = []
+        for d in dict_list:
+            record = {
+                "id": self.id_num,
+                "payload": {"chunk": d["text"]},
+                "vector": d["vector"],
+            }
+            self.id_num += 1
+            point_list.append(record)
+
+        # upsert in fixed batches of 1000 points (insert_batch_size is not used here)
+        batch_size = 1000
+        for i in tqdm(range(0, len(point_list), batch_size)):
+            batch = point_list[i : i + batch_size]
+            self.client.upsert(collection_name=collection_name, points=batch, wait=True)
+
+        print("***Insert done.")
+
+    # def show_table(self, collection_name=None):
+    #     tbl = self.client.open_table(collection_name)
+    #     print(tbl.to_pandas())
+
+    def query_search(
+        self,
+        query_vector,
+        topk,
+        collection_name=None,
+        search_batch_size=1,
+        multithread=False,
+        max_threads=4,
+        consistency_level="Eventually",
+        output_fields=["text", "vector"],
+        monitor=False,
+    ):
+        print(f"***Start query search in collection: {collection_name}")
+
+        total_queries = len(query_vector)
+
+        # Adjust search_batch_size if it exceeds total_queries
+        if search_batch_size > total_queries:
+            search_batch_size = total_queries
+
+        results = [None] * total_queries
+
+        num_batches = (total_queries + search_batch_size - 1) // search_batch_size
+
+        # def search_thread(start_idx, end_idx):
+        #     b_vectors = query_vector[start_idx:end_idx]
+
+        #     # b_results = tbl.search(b_vectors, vector_column_name='vector').limit(topk).nprobes(3).to_list()
+        #     b_results = self.client.query_points(collection_name=collection_name, query=b_vectors, with_payload=False, limit=topk).points
+
+        #     results[start_idx:end_idx] = b_results
+
+        # start_time = time.time()
+        # print(f"*** Start multithreaded search: total={self.retrieval_size}, batch_size={batch_size}, max_threads={max_threads}")
+        if max_threads == 1 or not multithread:
+            # Single-threaded search
+            for i in tqdm(range(num_batches), desc="Searching batches"):
+                start_idx = i * search_batch_size
+                end_idx = min(start_idx + search_batch_size, total_queries)
+
+                b_vectors = []
+                for vec in range(start_idx, end_idx):
b_vectors.append( + models.QueryRequest(query=query_vector[vec], limit=topk, with_payload=True) + ) + # b_results = ( + # self.client.query_points(collection_name=collection_name, query=b_vectors, limit=topk) + # .nprobes(1) + # .to_list() + # ) + # results[start_idx:end_idx] = b_results + + results[start_idx:end_idx] = self.client.query_batch_points( + collection_name=collection_name, requests=b_vectors + ) + # results[i] = self.client.query_points(collection_name=collection_name, query=query_vector[i], limit=topk) + else: + # with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + # futures = [] + # progress = tqdm(total=num_batches, desc="Searching batches") + + # def callback(future): + # progress.update(1) + + # for i in range(num_batches): + # start_idx = i * search_batch_size + # end_idx = min(start_idx + search_batch_size, total_queries) + # future = executor.submit(search_thread, start_idx, end_idx) + # future.add_done_callback(callback) + # futures.append(future) + + # concurrent.futures.wait(futures) + # progress.close() + print(("DEFAULT MULTITHREADING")) + + # print(results) + + # end_time = time.time() + context_format = """Source #{source_idx}\nDetail: {source_detail}\n""" + contexts_results = [] + with open("query.out", "w") as fout: + for query_idx, query_results in enumerate(results): + fout.write(f"=== Query #{query_idx + 1} Results ===\n") + context = [] + + for entry_idx, result in enumerate(query_results): + text = result[1][entry_idx].payload['chunk'] + formatted = context_format.format(source_idx=entry_idx, source_detail=text) + context.append(formatted) + fout.write(f"*** Retrieved result #{entry_idx}, doc length: {len(text)}\n") + fout.write("\n") + contexts_results.append(context) + + print(f"***Query search completed.") + return contexts_results + + def build_index( + self, + collection_name, + index_type, + metric_type, + num_partitions=256, + num_sub_vectors=96, + idx_name=None, + drop_index=True, + device=None, + index_cache_size=None, + ): + # Color print the index metrics used + print(f"Building index with parameters:", "cyan") + print(f" index_type: {index_type}", "green") + + # tbl = self.client.open_table(collection_name) + # tbl.create_index( + # metric=metric_type, + # num_partitions=num_partitions, + # num_sub_vectors=num_sub_vectors, + # vector_column_name='vector', + # replace=drop_index, + # accelerator=device, + # index_cache_size=32, + # index_type=index_type, + # num_bits=8, + # max_iterations=50, + # sample_rate=256, + # m=20, + # ef_construction=300, + # ) + + return + + +# test +if __name__ == "__main__": + print("Qdrant client test") + # change qdrant path to a local on + + qc = qdrant_client( + db_path="localhost:6333", + collection_name="test_collection", + dim=768, + index_type="IVF_PQ", + metric_type="L2", + ) + + qc.setup() + qc.create_collection("test_collection", dim=768) + # lc.drop_collection("test_collection") + # lc.create_collection("test_collection", dim=768) + # test insertion + dict_list = [] + for i in range(1000): + dict_list.append( + { + "text": f"Sample text {i}", + "vector": [random.random() for _ in range(768)], # Example vector of size 768 + } + ) + + qc.insert_data( + dict_list, collection_name="test_collection", insert_batch_size=10, create_collection=True + ) + # lc.show_table("test_collection") + # qc.build_index( + # "test_collection", + # index_type="IVF_HNSW_PQ", + # metric_type="L2", + # num_partitions=256, + # num_sub_vectors=96, + # drop_index=True, + # device=None, + # 
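# Stand-alone sketch of the query_batch_points round trip used above: each
# QueryRequest carries its own vector and limit, and each response object in
# the returned list exposes its hits via .points. URL and collection name are
# the local defaults assumed by this patch; vectors are random stand-ins.
import random
from qdrant_client import QdrantClient, models

demo_qc = QdrantClient(url="localhost:6333")
requests = [
    models.QueryRequest(query=[random.random() for _ in range(768)], limit=2, with_payload=True)
    for _ in range(4)
]
responses = demo_qc.query_batch_points(collection_name="test_collection", requests=requests)
for response in responses:
    for point in response.points:
        print(point.score, point.payload.get("chunk", "")[:40])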
index_cache_size=None, + # ) + results = qc.query_search( + query_vector=[[random.random() for _ in range(768)] for _ in range(2)], + topk=2, + collection_name="test_collection", + search_batch_size=2, + multithread=False, + max_threads=4, + consistency_level="Eventually", + output_fields=["text", "vector"], + monitor=True, + ) + + print("Query results:") + print(results) diff --git a/tests/dataset_test.py b/tests/dataset_test.py new file mode 100644 index 0000000..fc5af51 --- /dev/null +++ b/tests/dataset_test.py @@ -0,0 +1,115 @@ +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from datasetLoader.TextDatasetLoader import TextDatasetLoader +from datasetLoader.PDFDatasetLoader import PDFDatasetLoader +from datasetPreprocess.PDFDatasetPreprocess import PDFDatasetPreprocess +from datasetPreprocess.TextDatasetPreprocess import TextDatasetPreprocess +from encoder.sentenceTransformerEncoder import SentenceTransformerEncoder + +from vectordb.milvus_api import milvus_client + + +def text_test(): + # Simple hard-coded test; + + # Get 1024 wiki doc texts + length = 1024 + slice_id = 0 + + loader = TextDatasetLoader(dataset_name="wikimedia/wikipedia") + df = loader.get_dataset_slice(length=length, slice_id=slice_id) + + print(df.shape) + print(df.head(3)) # peek original dataset + + # Chunk all the doc we get + chunker = TextDatasetPreprocess() + chunked_texts = chunker.chunking_text_to_text(df) + + print(len(chunked_texts)) + for i in range(3): + print(chunked_texts[i]) + + # Embedding chunked texts + Embedder = SentenceTransformerEncoder( + device="cuda:0", sentence_transformers_name="all-MiniLM-L6-v2" + ) + vectors = Embedder.embedding(chunked_texts) + + print(vectors.shape) + print(vectors[0]) + + +def pdf_test(): + # Simple hard-coded test; change as you wish + length = 16 + slice_id = 2 + + loader = PDFDatasetLoader(dataset_name="common-pile/arxiv_papers") + + loader.download_pdf(100) + + df = loader.get_dataset_slice(length=length, offset=slice_id) + + print(df.head(3)) # peek + print(df.shape) + + # Chunk all the doc we get + chunker = PDFDatasetPreprocess() + chunked_texts = chunker.chunking_PDF_to_text(df) + + print(len(chunked_texts)) + for i in range(3): + print(chunked_texts[i]) + + # Embedding chunked texts + Embedder = SentenceTransformerEncoder( + device="cuda:0", sentence_transformers_name="all-MiniLM-L6-v2" + ) + vectors = Embedder.embedding(chunked_texts) + + print(vectors[0]) + + collection_name = "pdf_test_collection" + # connect to db_client + # vector_db: + # # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_1' + # collection_name: 'wikimedia_wikipedia_all_MiniLM_L6_v2_0_1_512' #IVF_FLAT + # # collection_name: 'wikimedia_wikipedia_All_mpnet_base_v2_0_1_512' #DISKANN + # # collection_name: 'wikimedia_wikipedia_Alibaba_NLP_gte_large_en_v1_5_0_1_512' #GPU_IVF + # db_path: http://localhost:19530 + # db_token: root:Milvus + # drop_previous_collection: false + # type: milvus + db_client = milvus_client( + db_path="http://localhost:19530", + db_token="root:Milvus", + collection_name=collection_name, + drop_previous_collection=False, + # dim=config["sys"]["vector_db"]["dim"], + index_type="GPU_IVF_FLAT", + metric_type="L2", + ) + + db_client.setup() + # db_client.create_collection(collection_name=collection_name, dim=Embedder.dim) + + db_client.insert_data_vector( + vector=vectors, + chunks=chunked_texts, + collection_name=collection_name, + insert_batch_size=4, + create_collection=True, + ) + + db_client.build_index( 
+ collection_name=collection_name, + index_type="GPU_IVF_FLAT", + metric_type="L2", + ) + + +if __name__ == "__main__": + pdf_test() diff --git a/tests/pipeline_test.py b/tests/pipeline_test.py new file mode 100644 index 0000000..cefefdf --- /dev/null +++ b/tests/pipeline_test.py @@ -0,0 +1,31 @@ +from RAGPipeline.TextsRAGPipline import TextsRAGPipeline +from RAGRequest.TextsRAGRequest import WikipediaRequests +from retriever.BaseRetriever import BaseRetriever +from vectordb.milvus_api import milvus_client + + +def pdf_test(): + + collection_name = "pdf_test_collection" + db_client = milvus_client( + db_path="http://localhost:19530", + db_token="root:Milvus", + collection_name=collection_name, + drop_previous_collection=False, + # dim=config["sys"]["vector_db"]["dim"], + index_type="GPU_IVF_FLAT", + metric_type="L2", + ) + + db_client.setup() + + Retriever = BaseRetriever(collection_name=collection_name, client=db_client) + RAGPipline = TextsRAGPipeline(retriever=Retriever) + RAGRequest = WikipediaRequests( + run_name="default_run", collection_name=collection_name, req_type="query", req_count=4 + ) + RAGPipline.process(RAGRequest) + + +if __name__ == "__main__": + pdf_test() diff --git a/tests/simple_example.py b/tests/simple_example.py new file mode 100644 index 0000000..35189b0 --- /dev/null +++ b/tests/simple_example.py @@ -0,0 +1,82 @@ +import sys +import os + +from datasetLoader.TextDatasetLoader import TextDatasetLoader +from datasetLoader.PDFDatasetLoader import PDFDatasetLoader +from datasetPreprocess.PDFDatasetPreprocess import PDFDatasetPreprocess +from datasetPreprocess.TextDatasetPreprocess import TextDatasetPreprocess +from encoder.sentenceTransformerEncoder import SentenceTransformerEncoder + +from RAGPipeline.TextsRAGPipline import TextsRAGPipeline +from RAGRequest.TextsRAGRequest import WikipediaRequests +from RAGPipeline.retriever.BaseRetriever import BaseRetriever +from vectordb.milvus_api import milvus_client + + +def insert_pdf(): + + length = 16 + slice_id = 2 + + loader = PDFDatasetLoader(dataset_name="common-pile/arxiv_papers") + loader.download_pdf(100) + + df = loader.get_dataset_slice(length=length, offset=slice_id) + + chunker = PDFDatasetPreprocess() + chunked_texts = chunker.chunking_PDF_to_text(df) + + Embedder = SentenceTransformerEncoder( + device="cuda:0", sentence_transformers_name="all-MiniLM-L6-v2" + ) + vectors = Embedder.embedding(chunked_texts) + + collection_name = "pdf_test_collection" + + db_client = milvus_client( + db_path="http://localhost:19530", + db_token="root:Milvus", + collection_name=collection_name, + drop_previous_collection=False, + # dim=config["sys"]["vector_db"]["dim"], + index_type="GPU_IVF_FLAT", + metric_type="L2", + ) + + db_client.setup() + + db_client.insert_data_vector( + vector=vectors, + chunks=chunked_texts, + collection_name=collection_name, + insert_batch_size=4, + create_collection=True, + ) + + return + + +def query_pdf(): + collection_name = "pdf_test_collection" + db_client = milvus_client( + db_path="http://localhost:19530", + db_token="root:Milvus", + collection_name=collection_name, + drop_previous_collection=False, + # dim=config["sys"]["vector_db"]["dim"], + index_type="GPU_IVF_FLAT", + metric_type="L2", + ) + db_client.setup() + + Retriever = BaseRetriever(collection_name=collection_name, client=db_client) + RAGPipline = TextsRAGPipeline(retriever=Retriever) + RAGRequest = WikipediaRequests( + run_name="default_run", collection_name=collection_name, req_type="query", req_count=4 + ) + 
RAGPipline.process(RAGRequest) + + +if __name__ == "__main__": + insert_pdf() + query_pdf() From f1c70e041d3fe516ac708d17863e6c79d6d9fea3 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 11:23:53 -0600 Subject: [PATCH 02/23] change dir structure --- req.txt => resource/example_req.txt | 0 parser.py => script/parser.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename req.txt => resource/example_req.txt (100%) rename parser.py => script/parser.py (100%) diff --git a/req.txt b/resource/example_req.txt similarity index 100% rename from req.txt rename to resource/example_req.txt diff --git a/parser.py b/script/parser.py similarity index 100% rename from parser.py rename to script/parser.py From b741324f22f38235de362336b3254ee20e33a682 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 11:25:47 -0600 Subject: [PATCH 03/23] add license --- LICENSE | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..57bc88a --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + From 4b3720e0690199173dd86f22f8dccf12b8125da7 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 11:42:32 -0600 Subject: [PATCH 04/23] restore format check CI. 
--- .clang-format-ignore | 7 +++ .github/actions/cmake-setup/action.yml | 69 ++++++++++++++++++++++ .github/workflows/black-format.yml | 80 ++++++++++++++++++++++++++ .github/workflows/clang-format.yml | 68 ++++++++++++++++++++++ .gitignore | 22 +++++++ .gitmodules | 13 +++++ README.md | 16 +++--- 7 files changed, 267 insertions(+), 8 deletions(-) create mode 100644 .clang-format-ignore create mode 100644 .github/actions/cmake-setup/action.yml create mode 100644 .github/workflows/black-format.yml create mode 100644 .github/workflows/clang-format.yml create mode 100644 .gitignore create mode 100644 .gitmodules diff --git a/.clang-format-ignore b/.clang-format-ignore new file mode 100644 index 0000000..a04dce8 --- /dev/null +++ b/.clang-format-ignore @@ -0,0 +1,7 @@ +# apart from all the .gitignores, what else files should be +# ignored by the clang formatter +.git/ +.github/ +third_party/* +*/generated/* +build/* \ No newline at end of file diff --git a/.github/actions/cmake-setup/action.yml b/.github/actions/cmake-setup/action.yml new file mode 100644 index 0000000..7f6ed7c --- /dev/null +++ b/.github/actions/cmake-setup/action.yml @@ -0,0 +1,69 @@ +name: "Setup CMake" +description: "Download/cache specific CMake version" + +inputs: + cmake-version: + description: 'CMake version' + required: true + cmake-cache-path: + description: 'Path to cache CMake' + required: false + default: ~/.cache +outputs: + cmake-path: + description: 'Path to the CMake executable' + value: ${{ steps.add-cmake-to-path.outputs.cmake-path }} + +runs: + using: "composite" + steps: + - name: Setup Variables + id: variables + run: | + CMAKE_VERSION="${{ inputs.cmake-version }}" + CMAKE_COMPRESSED="cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_COMPRESSED}" + CMAKE_SHA256_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-SHA-256.txt" + CMAKE_CACHE_PATH="${{ inputs.cmake-cache-path }}/cmake-${CMAKE_VERSION}" + + echo "cmake-version=${CMAKE_VERSION}" >> "${GITHUB_OUTPUT}" + echo "cmake-compressed=${CMAKE_COMPRESSED}" >> "${GITHUB_OUTPUT}" + echo "cmake-url=${CMAKE_URL}" >> "${GITHUB_OUTPUT}" + echo "cmake-sha256-url=${CMAKE_SHA256_URL}" >> "${GITHUB_OUTPUT}" + echo "cmake-cache-path=${CMAKE_CACHE_PATH}" >> "${GITHUB_OUTPUT}" + shell: bash + + - name: Cache CMake + id: cache-cmake + uses: actions/cache@v3 + with: + path: ${{ steps.variables.outputs.cmake-cache-path }} + key: ${{ runner.os }}-cmake-${{ inputs.cmake-version }} + + - name: Download & Extract CMake + if: steps.cache-cmake.outputs.cache-hit != 'true' + run: | + # Download CMake and its SHA256 file + curl -L "${{ steps.variables.outputs.cmake-url }}" -o "${{ steps.variables.outputs.cmake-compressed }}" + curl -L "${{ steps.variables.outputs.cmake-sha256-url }}" -o cmake.sha256 + + # Extract the expected SHA256 from the file + EXPECTED_SHA=$( + grep "${{ steps.variables.outputs.cmake-compressed }}" "cmake.sha256" | + awk '{ print $1 }' + ) + + # Verify the SHA256 checksum + echo "${EXPECTED_SHA} ${{ steps.variables.outputs.cmake-compressed }}" | sha256sum -c - + + # Create the cache directory and extract CMake + mkdir -p ${{ steps.variables.outputs.cmake-cache-path }} + tar -xzf "${{ steps.variables.outputs.cmake-compressed }}" --strip-components=1 -C ${{ steps.variables.outputs.cmake-cache-path }} + shell: bash + + - name: Add CMake to PATH & Export CMake Executable Path + id: add-cmake-to-path + run: | + echo "${{ 
steps.variables.outputs.cmake-cache-path }}/bin" >> "${GITHUB_PATH}" + echo "cmake-path=${{ steps.variables.outputs.cmake-cache-path }}/bin/cmake" >> "${GITHUB_OUTPUT}" + shell: bash diff --git a/.github/workflows/black-format.yml b/.github/workflows/black-format.yml new file mode 100644 index 0000000..fb31f0e --- /dev/null +++ b/.github/workflows/black-format.yml @@ -0,0 +1,80 @@ +name: Python Format Check + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + cmake-version: "3.22.6" + python-version: "3.10" + +jobs: + build: + # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. + # You can convert this to a matrix build if you need cross-platform coverage. + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + # === Get CMake === + - name: Setup CMake + id: setup-cmake + uses: ./.github/actions/cmake-setup + with: + cmake-version: ${{ env.cmake-version }} + + # === Get Python3 === + - name: Set up Python3 + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python-version }} + + # === CMake configuration === + - name: Initialize Build System + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + ${{ env.cmake-exe }} -B ${{github.workspace}}/build -DFORMATTING_ONLY=ON + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target help + + # === Check for CMake format check target === + - name: Check CMake Format Target + id: check-cmake-format-target + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + # skip download if target is found + if ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target help | grep -q "python-check-format"; then + echo "black-found=true" >> "${GITHUB_OUTPUT}" + else + echo "black-found=false" >> "${GITHUB_OUTPUT}" + fi + + # === Get Python3 black formatter === + - name: Get Python3 Black Formatter + if: steps.check-cmake-format-target.outputs.black-found != 'true' + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + # install_black_py3pkg_requirements target will only be available if the black package is not found + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target install_black_py3pkg_requirements + + # === Redo the CMake configuration === + - name: Initialize Build System Again + if: steps.check-cmake-format-target.outputs.black-found != 'true' + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + ${{ env.cmake-exe }} -B ${{github.workspace}}/build -DFORMATTING_ONLY=ON + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target help + + # === Format check === + - name: Python Format Check + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target python-check-format diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 0000000..e28d323 --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,68 @@ +name: C/C++ Format Check + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + cmake-version: "3.22.6" + clang-format-version: "14" + +jobs: + build: + # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. 
+ # You can convert this to a matrix build if you need cross-platform coverage. + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + # === Get CMake === + - name: Setup CMake + id: setup-cmake + uses: ./.github/actions/cmake-setup + with: + cmake-version: ${{ env.cmake-version }} + + # === Get clang-format === + - name: Cache clang-format + uses: actions/cache@v3 + id: cache-clang-format + with: + path: ~/.cache/clang-format-${{ env.clang-format-version }} + key: ${{ runner.os }}-clang-format-${{ env.clang-format-version }} + + - name: Download clang-format + if: steps.cache-clang-format.outputs.cache-hit != 'true' + env: + clang-format-url: https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-2da3e7b/clang-format-${{ env.clang-format-version }}_linux-amd64 + clang-format-cache-path: ~/.cache/clang-format-${{ env.clang-format-version }} + clang-format-sha256: 5daf48b8331afb85575e11dfd73ffc6bf47af10ccde260b40751df246f1bf1ff + run: | + curl -L ${{ env.clang-format-url }} -o clang-format + echo "${{ env.clang-format-sha256 }} clang-format" | sha256sum -c - + chmod +x clang-format + mkdir -p ${{ env.clang-format-cache-path }} + mv clang-format ${{ env.clang-format-cache-path }}/clang-format-${{ env.clang-format-version }} + + - name: Add clang-format to PATH + run: echo ~/.cache/clang-format-${{ env.clang-format-version }} >> "$GITHUB_PATH" + + # === CMake configuration and build === + - name: CMake Setup + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + ${{ env.cmake-exe }} -B ${{github.workspace}}/build -DFORMATTING_ONLY=ON + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target help + + - name: Format Check + env: + cmake-exe: ${{ steps.setup-cmake.outputs.cmake-path }} + run: | + which clang-format + ${{ env.cmake-exe }} --build ${{github.workspace}}/build --target cpp-check-format + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0a53e75 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +# editor specific +.vscode/ + +# build directory +build/ + +# any database +*.db + +# python +.python_history +.ptpython_history +**/__pycache__/ +requirements.txt + +# build artifacts +build/ + +# dataset data +*.csv +*.arrow +*.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..9d811ef --- /dev/null +++ b/.gitmodules @@ -0,0 +1,13 @@ +[submodule "third_party/pybind11"] + path = third_party/pybind11 + url = https://github.com/pybind/pybind11.git +[submodule "third_party/protobuf"] + path = third_party/protobuf + url = https://github.com/protocolbuffers/protobuf.git +[submodule "third_party/date"] + path = third_party/date + url = https://github.com/HowardHinnant/date.git +[submodule "third_party/pybind11_mkdoc"] + path = third_party/pybind11_mkdoc + url = https://github.com/0-EricZhou-0/pybind11_mkdoc.git + branch = arg_support diff --git a/README.md b/README.md index 7f5bdc0..6188e43 100644 --- a/README.md +++ b/README.md @@ -32,12 +32,12 @@ ## Table of Contents - [RASB: RAG-based AI System Benchmakring Framework](#rasb-rag-based-ai-system-benchmakring-framework) - - [Unique Features](#unique-features) + - [Features](#features) - [Installation](#installation) - [1) Create a virtual environment](#1-create-a-virtual-environment) - [2) Python dependencies](#2-python-dependencies) - [3) Install monitor system](#3-install-monitor-system) - - 
[Usage](#usage) + - [Running RASB](#running-rasb) - [Quick Start with Web UI](#quick-start-with-web-ui) - [1) Preparation](#1-preparation) - [2) Config your Benchmark and run](#2-config-your-benchmark-and-run) @@ -51,9 +51,9 @@ - [Customized Modules](#customized-modules) ## Installation -We highly recommend using an isolated Python environment (Conda). ### 1) Create a virtual environment +To run RASB, we highly recommend using an isolated Python environment (e.g., Conda). **Conda (recommended)** ```bash @@ -81,7 +81,7 @@ python3 -m pip install -r ../requirement.txt RASB uses a custom, low-overhead monitoring daemon. Please refer to the documentations at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. -## Usage +## Running RASB RASB provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation. ### Quick Start with Web UI @@ -120,12 +120,12 @@ export HF_HOME="/mnt/data/hf_home" ``` #### 2) Running the Benchmark -To run the benchmark, we first need to setup the retriever like a vectorDB. See [vectordb](#vectordb). Change the db_path to your local vectordb path in config file. +To run the benchmark, you first need to setup the vectorDB as the retriever. See [vectordb](#vectordb) for a supported list and quick setup guide. Change the db_path to your local vectordb path in config file. ``` vector_db: db_path: /mnt/data/vectordb ``` -First run the **preprocess/insert** phase to insert the dataset. +First run the **preprocess/insert** phase to insert the dataset: ```bash # 1) Build/insert into the vector store (LanceDB example) @@ -133,9 +133,9 @@ python3 src/run_new.py \ --config config/lance_insert.yaml \ --msys-config config/monitor/example_config.yaml ``` -To execute the **query/evaluate**, run the following: +After the insertion stage, proceed to the **query/evaluate** stage. Run the following: ```bash -# 2) Retreival and Query +# 2) Retreival and Query python3 src/run_new.py \ --config config/lance_query.yaml \ --msys-config config/monitor/example_config.yaml From 9e6f0f64021bab2fa7af6c93e41b99860962ca64 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 11:49:38 -0600 Subject: [PATCH 05/23] fix CI bage link to current repo. --- README.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6188e43..5abf846 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ **RASB** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. 
-[![C/C++ Format Check](https://github.com/IOScience/RAGPipeline/actions/workflows/clang-format.yml/badge.svg)](https://github.com/IOScience/RAGPipeline/actions/workflows/clang-format.yml) -[![Python Format Check](https://github.com/IOScience/RAGPipeline/actions/workflows/black-format.yml/badge.svg)](https://github.com/IOScience/RAGPipeline/actions/workflows/black-format.yml) +[![C/C++ Format Check](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml/badge.svg)](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml) +[![Python Format Check](https://github.com/platformxlab/RAGPerf/actions/workflows/black-format.yml/badge.svg)](https://github.com/platformxlab/RAGPerf/actions/workflows/black-format.yml) ![CMake](https://img.shields.io/badge/CMake-008fba.svg?style=flat&logo=cmake&logoColor=ffffff) @@ -48,7 +48,6 @@ - [Supported RAG Pipeline Modules](#supported-rag-pipeline-modules) - [VectorDB](#vectordb) - [Monitoring System](#monitoring-system) - - [Customized Modules](#customized-modules) ## Installation @@ -58,8 +57,8 @@ To run RASB, we highly recommend using an isolated Python environment (e.g., Con **Conda (recommended)** ```bash # Install Miniconda/Mambaforge from the official site if you don't have Conda -conda create -n ragbench python=3.10 -conda activate ragbench +conda create -n rasb python=3.10 +conda activate rasb ``` ### 2) Python dependencies @@ -156,6 +155,3 @@ Want to add a new DB? Check our RASB API at [VectorDB API](src/vectordb/README.m ### Monitoring System Examples of how to use it is documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md) - -### Customized Modules - From ec49b94eb49a95c54ebdf41ba5400d963261e2cf Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 12:03:19 -0600 Subject: [PATCH 06/23] update README --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5abf846..53956eb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# RASB: RAG-based AI System Benchmakring Framework +# RASB: RAG-based AI System Benchmarking Framework **RASB** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. @@ -14,13 +14,13 @@ [![Code style: clang-format](https://img.shields.io/badge/C/C++_Code_Style-clang--format-2a3e50?style=flat&logo=llvm&logoColor=cccccc)](resource/clang_format/.clang-format) [![Code style: black](https://img.shields.io/badge/Python_Code_Style-black-000000?style=flat&logo=black&logoColor=ffffff)](resource/black_format/.black-format) -## Features +## Key Features -**🚀 Holistic System-Centric Benchmarking**: RASB moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdown (retrieval vs. generation), and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. +**🚀 Holistic System-Centric Benchmarking**: RASB moves beyond simple accuracy metrics to profile the performance of RAG systems. 
It measures end-to-end throughput (QPS), latency breakdown, and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. -**🧩 Modular Architecture**: RASB employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from vLLM to OpenAI APIs—without rewriting code. This enables fine-grained analysis of specific component trade-offs. +**🧩 Modular Architecture**: RASB employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from ChatGPT to Qwen—without rewriting code. This enables fine-grained analysis of specific component trade-offs. -**📊 Detailed Full-Stack Profiling**: RASB integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory hierarchy pressure (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between the VectorDB and LLM. +**📊 Detailed Full-Stack Profiling**: RASB integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory usage (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between RAG components. **🔄 Dynamic Workload Generation**: Simulates the evolution of real-world knowledge bases. The workload generator can interleave standard search queries with insert, update, and delete operations. This allows you to stress-test how a RAG system handles high-concurrency requests while maintaining data freshness. @@ -31,8 +31,8 @@ ## Table of Contents -- [RASB: RAG-based AI System Benchmakring Framework](#rasb-rag-based-ai-system-benchmakring-framework) - - [Features](#features) +- [RASB: RAG-based AI System Benchmarking Framework](#rasb-rag-based-ai-system-benchmarking-framework) + - [Key Features](#key-features) - [Installation](#installation) - [1) Create a virtual environment](#1-create-a-virtual-environment) - [2) Python dependencies](#2-python-dependencies) From a0c70ee4f42664e1b5da85a172ce7de01c6d48ad Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sat, 10 Jan 2026 13:03:32 -0600 Subject: [PATCH 07/23] modify benchmark name --- README.md | 36 ++++++++++++++++++------------------ config/README.md | 2 +- src/vectordb/README.md | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 53956eb..d5014bd 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# RASB: RAG-based AI System Benchmarking Framework +# RAGPerf: RAG-based AI System Benchmarking Framework -**RASB** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. +**RAGPerf** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. 
Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. [![C/C++ Format Check](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml/badge.svg)](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml) @@ -16,28 +16,28 @@ ## Key Features -**🚀 Holistic System-Centric Benchmarking**: RASB moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdown, and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. +**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdown, and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. -**🧩 Modular Architecture**: RASB employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from ChatGPT to Qwen—without rewriting code. This enables fine-grained analysis of specific component trade-offs. +**🧩 Modular Architecture**: RAGPerf employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from ChatGPT to Qwen—without rewriting code. This enables fine-grained analysis of specific component trade-offs. -**📊 Detailed Full-Stack Profiling**: RASB integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory usage (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between RAG components. +**📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory usage (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between RAG components. **🔄 Dynamic Workload Generation**: Simulates the evolution of real-world knowledge bases. The workload generator can interleave standard search queries with insert, update, and delete operations. This allows you to stress-test how a RAG system handles high-concurrency requests while maintaining data freshness. -**🖼️ Multi-Modal Capabilities**: RASB supports diverse data modalities beyond plain text. It includes specialized pipelines for Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured enterprise data pipelines. +**🖼️ Multi-Modal Capabilities**: RAGPerf supports diverse data modalities beyond plain text. It includes specialized pipelines for Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured enterprise data pipelines. 
--- ## Table of Contents -- [RASB: RAG-based AI System Benchmarking Framework](#rasb-rag-based-ai-system-benchmarking-framework) +- [RAGPerf: RAG-based AI System Benchmarking Framework](#ragperf-rag-based-ai-system-benchmarking-framework) - [Key Features](#key-features) - [Installation](#installation) - [1) Create a virtual environment](#1-create-a-virtual-environment) - [2) Python dependencies](#2-python-dependencies) - [3) Install monitor system](#3-install-monitor-system) - - [Running RASB](#running-rasb) + - [Running RAGPerf](#running-ragperf) - [Quick Start with Web UI](#quick-start-with-web-ui) - [1) Preparation](#1-preparation) - [2) Config your Benchmark and run](#2-config-your-benchmark-and-run) @@ -52,13 +52,13 @@ ## Installation ### 1) Create a virtual environment -To run RASB, we highly recommend using an isolated Python environment (e.g., Conda). +To run RAGPerf, we highly recommend using an isolated Python environment (e.g., Conda). **Conda (recommended)** ```bash # Install Miniconda/Mambaforge from the official site if you don't have Conda -conda create -n rasb python=3.10 -conda activate rasb +conda create -n RAGPerf python=3.10 +conda activate RAGPerf ``` ### 2) Python dependencies @@ -78,10 +78,10 @@ python3 -m pip install -r ../requirement.txt ### 3) Install monitor system -RASB uses a custom, low-overhead monitoring daemon. Please refer to the documentations at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. +RAGPerf uses a custom, low-overhead monitoring daemon. Please refer to the documentations at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. -## Running RASB -RASB provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation. +## Running RAGPerf +RAGPerf provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation. ### Quick Start with Web UI #### 1) Preparation @@ -93,11 +93,11 @@ export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}" # Where to cache Hugging Face models (optional, adjust path as needed) export HF_HOME="/mnt/data/hf_home" ``` -Install streamlit and run the RASB client. +Install streamlit and run the RAGPerf client. ```bash # install streamlit python3 -m pip install streamlit -# run RASB +# run RAGPerf streamlit run ui_client.py ``` Open the UI with the reported url with your web browser, the default url is `http://localhost:8501`. @@ -148,9 +148,9 @@ You can check the output result within the `./output` folder. To visualize the o ### VectorDB -RASB already intergrates with many popular vectorDBs. To setup, check the detailed documentations at [VectorDB README](src/vectordb/README.md) +RAGPerf already intergrates with many popular vectorDBs. To setup, check the detailed documentations at [VectorDB README](src/vectordb/README.md) -Want to add a new DB? Check our RASB API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database) to standardize operations. To add a new database +Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database) to standardize operations. 
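As a rough illustration of the surface that API standardizes, here is a minimal sketch of what a new backend could look like. The constructor arguments and the `setup` / `insert_data_vector` / `build_index` / `query_search` methods mirror the `milvus_client` calls exercised in `tests/dataset_test.py` and `tests/simple_example.py` earlier in this series; the class name `toy_db_client` and its in-memory storage are hypothetical stand-ins, not part of the repository, and the authoritative signatures are the ones in the VectorDB API document linked above.

```python
import numpy as np


class toy_db_client:
    """Hypothetical in-memory backend mirroring the client calls used in the tests."""

    def __init__(self, db_path, db_token=None, collection_name=None,
                 drop_previous_collection=False, index_type=None, metric_type="L2"):
        self.db_path = db_path
        self.collection_name = collection_name
        self._collections = {}  # stand-in for a real database connection

    def setup(self):
        # A real client would open the connection here; we only prepare storage.
        if self.collection_name is not None:
            self._collections.setdefault(self.collection_name, {"vectors": [], "chunks": []})

    def insert_data_vector(self, vector, chunks, collection_name,
                           insert_batch_size=64, create_collection=False):
        if create_collection:
            self._collections.setdefault(collection_name, {"vectors": [], "chunks": []})
        col = self._collections[collection_name]
        # Write in batches, as the real clients do.
        for start in range(0, len(chunks), insert_batch_size):
            end = start + insert_batch_size
            col["vectors"].extend(np.asarray(v, dtype=np.float32) for v in vector[start:end])
            col["chunks"].extend(chunks[start:end])

    def build_index(self, collection_name, index_type="FLAT", metric_type="L2"):
        pass  # this toy backend searches brute-force, so there is no index to build

    def query_search(self, query_vector, topk, collection_name, **kwargs):
        col = self._collections[collection_name]
        mat = np.stack(col["vectors"])
        hits = []
        for q in np.asarray(query_vector, dtype=np.float32):
            dists = np.linalg.norm(mat - q, axis=1)  # L2 distance to every stored vector
            order = np.argsort(dists)[:topk]
            hits.append([{"text": col["chunks"][i], "distance": float(dists[i])}
                         for i in order])
        return hits
```

A backend written against this surface could then be selected through the same `type:` key the config files use for `milvus` and `lance`, leaving the rest of the pipeline untouched.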
### Monitoring System
Examples of how to use it is documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md)
diff --git a/config/README.md b/config/README.md
index 7fafcd0..f4317b2 100644
--- a/config/README.md
+++ b/config/README.md
@@ -1,4 +1,4 @@
-# RASB Configuration Guide
+# RAGPerf Configuration Guide
This document details the configuration parameters used in the RAG (Retrieval-Augmented Generation) benchmarking pipeline. The configuration file is in YAML format and controls data processing, model selection, hardware allocation, and pipeline execution flow.
diff --git a/src/vectordb/README.md b/src/vectordb/README.md
index 4158a23..440f795 100644
--- a/src/vectordb/README.md
+++ b/src/vectordb/README.md
@@ -1,7 +1,7 @@
# Vector Database Module
-This module provides a unified interface for interacting with various locally deployable Vector Databases. RASB abstracts the low-level client management, allowing you to switch between different backends (e.g., changing from LanceDB to Milvus) by simply modifying a configuration file.
+This module provides a unified interface for interacting with various locally deployable Vector Databases. RAGPerf abstracts the low-level client management, allowing you to switch between different backends (e.g., changing from LanceDB to Milvus) by simply modifying a configuration file.
## 📦 Supported Databases
From da944219a55ee9ad99017a02e38eb4b55b494392 Mon Sep 17 00:00:00 2001
From: platformxlab <97150744+platformxlab@users.noreply.github.com>
Date: Sat, 10 Jan 2026 16:43:53 -0600
Subject: [PATCH 08/23] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index d5014bd..3034c29 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# RAGPerf: RAG-based AI System Benchmarking Framework
+# RAGPerf: A RAG-based AI System Benchmarking Framework
**RAGPerf** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations.
From 1a6d63303bb06530127e11275e9f5a311b0e6eee Mon Sep 17 00:00:00 2001
From: platformxlab <97150744+platformxlab@users.noreply.github.com>
Date: Sat, 10 Jan 2026 16:47:05 -0600
Subject: [PATCH 09/23] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 3034c29..e52d473 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# RAGPerf: A RAG-based AI System Benchmarking Framework
From 841494143f9bbc312c4b8ed2d95fc6139e1abc7c Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sun, 11 Jan 2026 07:57:40 -0600 Subject: [PATCH 10/23] update --- README.md | 36 ++++++++++++++++++------------------ config/README.md | 4 ++-- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index e52d473..2901175 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # RAGPerf: An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems -**RAGPerf** is an open-source framework designed to benchmark the End-to-End system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. +**RAGPerf** is an open-source framework designed to benchmark the end-to-end system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers a user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations. [![C/C++ Format Check](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml/badge.svg)](https://github.com/platformxlab/RAGPerf/actions/workflows/clang-format.yml) @@ -16,22 +16,22 @@ ## Key Features -**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdown, and hardware efficiency, helping you identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. +**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdowns, and hardware efficiency. This helps developers identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages. -**🧩 Modular Architecture**: RAGPerf employs a configuration-driven design that abstracts the entire RAG pipeline—Embedding, Vector Database, Reranking, and Generation—behind uniform interfaces. You can seamlessly swap components—switching from Milvus to LanceDB, or from ChatGPT to Qwen—without rewriting code. This enables fine-grained analysis of specific component trade-offs. +**🧩 Modular Architecture**: RAGPerf uses a configuration-driven design that abstracts the RAG pipeline (Embedding, Vector Database, Reranking, and Generation) behind uniform interfaces. Users can seamlessly switch components (e.g., switching from Milvus to LanceDB, or ChatGPT to Qwen) without rewriting code. This enables detailed performance comparisons between different components. **📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory usage (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between RAG components. -**🔄 Dynamic Workload Generation**: Simulates the evolution of real-world knowledge bases. The workload generator can interleave standard search queries with insert, update, and delete operations. This allows you to stress-test how a RAG system handles high-concurrency requests while maintaining data freshness. 
+**🔄 Dynamic Workload Generation**: RAGPerf is able to simulate the evolution of real-world knowledge bases. The workload generator also supports queries with insert, update, and delete operations, allows users to measure how these operations impact data freshness and overall system performance. -**🖼️ Multi-Modal Capabilities**: RAGPerf supports diverse data modalities beyond plain text. It includes specialized pipelines for Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured enterprise data pipelines. +**🖼️ Multi-Modal Capabilities**: RAGPerf supports diverse data modalities beyond plain text. It provides specialized pipelines including Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured RAG pipelines. --- ## Table of Contents -- [RAGPerf: RAG-based AI System Benchmarking Framework](#ragperf-rag-based-ai-system-benchmarking-framework) +- [RAGPerf: An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems](#ragperf-an-end-to-end-benchmarking-framework-for-retrieval-augmented-generation-systems) - [Key Features](#key-features) - [Installation](#installation) - [1) Create a virtual environment](#1-create-a-virtual-environment) @@ -40,7 +40,7 @@ - [Running RAGPerf](#running-ragperf) - [Quick Start with Web UI](#quick-start-with-web-ui) - [1) Preparation](#1-preparation) - - [2) Config your Benchmark and run](#2-config-your-benchmark-and-run) + - [2) Configure the benchmark and run](#2-configure-the-benchmark-and-run) - [Run with Command Line (CLI)](#run-with-command-line-cli) - [1) Preparation](#1-preparation-1) - [2) Running the Benchmark](#2-running-the-benchmark) @@ -69,7 +69,7 @@ We use `pip-tools` to ensure reproducible dependency resolution. # install pip-compile for python package dependency resolution python3 -m pip install pip-tools -# configure MSys and generate a list of all required python packages +# Generate list of all required python packages mkdir build && cd build cmake .. make generate_py3_requirements @@ -78,7 +78,7 @@ python3 -m pip install -r ../requirement.txt ### 3) Install monitor system -RAGPerf uses a custom, low-overhead monitoring daemon. Please refer to the documentations at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. +RAGPerf uses a custom, low-overhead monitoring daemon. Please refer to the documentation at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. ## Running RAGPerf RAGPerf provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation. @@ -100,12 +100,12 @@ python3 -m pip install streamlit # run RAGPerf streamlit run ui_client.py ``` -Open the UI with the reported url with your web browser, the default url is `http://localhost:8501`. +Open the UI with the reported url in your web browser, the default url is `http://localhost:8501`. -#### 2) Config your Benchmark and run -To run the benchmark, we first need to setup the retriever like a vectorDB. See [vectordb](#vectordb). The in the webpage, customize your own workload setting. ![config](./doc/figures/ragconfig.png) +#### 2) Configure the benchmark and run +To run the benchmark, we first need to set up the retriever like a vectorDB. See [vectordb](#vectordb). Then, on the webpage, customize your own workload setting. 
![config](./doc/figures/ragconfig.png)

Then in the execute page, click execute to execute the workload. You may also need to check the config file before the execution, see [here](./config/README.md) for config explanation. ![config](./doc/figures/run.png)

### Run with Command Line (CLI)
#### 1) Preparation
@@ -119,7 +119,7 @@ export HF_HOME="/mnt/data/hf_home"
```

#### 2) Running the Benchmark
-To run the benchmark, you first need to setup the vectorDB as the retriever. See [vectordb](#vectordb) for a supported list and quick setup guide. Change the db_path to your local vectordb path in config file.
+To run the benchmark, you first need to set up the vectorDB as the retriever. See [vectordb](#vectordb) for a supported list and quick setup guide. Change the db_path to your local vectordb path in the config file.
```
vector_db:
  db_path: /mnt/data/vectordb
@@ -134,12 +134,12 @@ python3 src/run_new.py \
```
After the insertion stage, proceed to the **query/evaluate** stage. Run the following:
```bash
-# 2) Retreival and Query
+# 2) Retrieval and Query
python3 src/run_new.py \
  --config config/lance_query.yaml \
  --msys-config config/monitor/example_config.yaml
```
-To customize your own workload setting, you may reference the provided config file within `./config` folder. The detailed parameter are listed [here](config/README.md)
+To customize your own workload setting, you may reference the provided config file within the `./config` folder. The detailed parameters are listed [here](config/README.md).

#### 3) Output Analysis
You can check the output result within the `./output` folder. To visualize the output results, run `python3 example/monitoring_sys_lib/test_parser.py`, the visualized figures will be located within the `./output`.
@@ -148,10 +148,10 @@
### VectorDB
-RAGPerf already intergrates with many popular vectorDBs. To setup, check the detailed documentations at [VectorDB README](src/vectordb/README.md)
+RAGPerf already integrates with many popular vectorDBs. To set up, check the detailed documentation at [VectorDB README](src/vectordb/README.md).
Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database) to standardize operations.
### Monitoring System
-Examples of how to use it is documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md)
+Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentation at [MonitoringSystem README](monitoring_sys/README.md).
diff --git a/config/README.md b/config/README.md
index f4317b2..ca71925 100644
--- a/config/README.md
+++ b/config/README.md
@@ -1,6 +1,6 @@
# RAGPerf Configuration Guide
-This document details the configuration parameters used in the RAG (Retrieval-Augmented Generation) benchmarking pipeline. The configuration file is in YAML format and controls data processing, model selection, hardware allocation, and pipeline execution flow.
+This document details the configuration parameters used in RAGPerf. The configuration file is in YAML format and controls data processing, model selection, hardware allocation, and pipeline execution flow.
## 1. Top-Level Metadata
@@ -12,7 +12,7 @@ This document details the configuration parameters used in the Retrieval-Au
## 2. Benchmark Data Settings (`bench`)
-This section defines the dataset source and how the text/image is pre-processed before ingestion.
+This section defines the dataset source and how text or images are preprocessed before ingestion.
```yaml
bench:
From 58c782cb490f6bc149e925fd188d431f9fd43b0f Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Sun, 11 Jan 2026 22:53:31 +0800
Subject: [PATCH 11/23] updated wording & formatting
---
README.md | 115 ++++++++++++++++++++++++++++++++++--------------------
1 file changed, 72 insertions(+), 43 deletions(-)
diff --git a/README.md b/README.md
index 2901175..7486e52 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# RAGPerf: An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems 
+# RAGPerf: An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems

**RAGPerf** is an open-source framework designed to benchmark the end-to-end system performance of Retrieval-Augmented Generation (RAG) applications. Built with a fully modular architecture, it offers a user-friendly and highly customizable framework that allows precise measurement of throughput, latency, and scalability across different RAG configurations.

@@ -16,13 +16,13 @@

## Key Features

-**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdowns, and hardware efficiency. This helps developers identify whether a bottleneck lies in I/O-bound retrieval or compute-bound prefill/decoding stages.
+**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdowns, and hardware efficiency. This helps developers identify potential bottlenecks throughout the entire pipeline.

-**🧩 Modular Architecture**: RAGPerf uses a configuration-driven design that abstracts the RAG pipeline (Embedding, Vector Database, Reranking, and Generation) behind uniform interfaces. Users can seamlessly switch components (e.g., switching from Milvus to LanceDB, or ChatGPT to Qwen) without rewriting code. This enables detailed performance comparisons between different components.
+**🧩 Modular Architecture**: RAGPerf uses a modular design that abstracts different stages of the RAG pipeline (Embedding, Vector Database, Reranking, and Generation) behind uniform interfaces. Users can seamlessly switch components (e.g., switching the underlying vector database from Milvus to LanceDB, or changing the underlying generative model from ChatGPT to Qwen) without rewriting code. This enables detailed performance comparisons between different pipelines.

-**📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight system profiler that runs as a background daemon. It captures granular hardware metrics with minimal overhead, including GPU/CPU utilization, memory usage (host RAM vs. GPU VRAM), PCIe throughput, and Disk I/O. This allows for deep analysis of resource contention between RAG components.
+**📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight system profiler that runs as a background daemon. It captures fine-grained hardware metrics with minimal overhead, including GPU/CPU utilization, memory consumption (host RAM & GPU VRAM), PCIe throughput, and disk I/O utilization. This allows detailed analysis of resource utilization between RAG components and helps identify potential contention issues.
This allows detailed analysis of resource utilization between RAG components and help finding potential contention issues. -**🔄 Dynamic Workload Generation**: RAGPerf is able to simulate the evolution of real-world knowledge bases. The workload generator also supports queries with insert, update, and delete operations, allows users to measure how these operations impact data freshness and overall system performance. +**🔄 Simulating Real-World Scenarios**: RAGPerf is able to simulate the evolution of real-world knowledge bases by synthesizing updates with a custom and configurable workload generator. The workload generator supports generating insert, update, and delete requests at different frequency and patterns, allowing users to estimate how data freshness and overall system performance varies in real systems. **🖼️ Multi-Modal Capabilities**: RAGPerf supports diverse data modalities beyond plain text. It provides specialized pipelines including Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured RAG pipelines. @@ -31,28 +31,29 @@ ## Table of Contents -- [RAGPerf: An End-to-End Benchmarking Framework for Retrieval-Augmented Generation Systems](#ragperf-an-end-to-end-benchmarking-framework-for-retrieval-augmented-generation-systems) - - [Key Features](#key-features) - - [Installation](#installation) - - [1) Create a virtual environment](#1-create-a-virtual-environment) - - [2) Python dependencies](#2-python-dependencies) - - [3) Install monitor system](#3-install-monitor-system) - - [Running RAGPerf](#running-ragperf) - - [Quick Start with Web UI](#quick-start-with-web-ui) - - [1) Preparation](#1-preparation) - - [2) Configure the benchmark and run](#2-configure-the-benchmark-and-run) - - [Run with Command Line (CLI)](#run-with-command-line-cli) - - [1) Preparation](#1-preparation-1) - - [2) Running the Benchmark](#2-running-the-benchmark) - - [3) Output Analysis](#3-output-analysis) - - [Supported RAG Pipeline Modules](#supported-rag-pipeline-modules) - - [VectorDB](#vectordb) - - [Monitoring System](#monitoring-system) +- [Key Features](#key-features) +- [Installation](#installation) + - [Create a Virtual Environment](#create-a-virtual-environment) + - [Install Dependencies](#install-dependencies) + - [Install Monitoring System](#install-monitoring-system) +- [Running RAGPerf](#running-ragperf) + - [Quick Start with Web UI](#quick-start-with-web-ui) + - [Preparation](#preparation) + - [Configuring the Benchmark](#configuring-the-benchmark) + - [Running the Benchmark](#running-the-benchmark) + - [Run with Command Line Interface](#run-with-command-line-interface) + - [Preparation](#preparation-1) + - [Running the Benchmark](#running-the-benchmark-1) + - [Performing Analysis](#performing-analysis) +- [Supported RAG Pipeline Modules](#supported-rag-pipeline-modules) + - [Vector Databases](#vector-databases) + - [Monitoring System](#monitoring-system) ## Installation -### 1) Create a virtual environment -To run RAGPerf, we highly recommend using an isolated Python environment (e.g., Conda). +### Create a Virtual Environment + +To run RAGPerf, we highly recommend using an isolated Python environment using a Python virtual environment manager (e.g., `venv`, `conda`) to avoid package conflicts, we use `conda` for demonstrating purposes throughout the documentation. 
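+If you prefer the standard-library `venv` instead of `conda`, an equivalent sketch:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+```
+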
**Conda (recommended)**
```bash
@@ -61,7 +62,8 @@ conda create -n RAGPerf python=3.10
conda activate RAGPerf
```

-### 2) Python dependencies
+### Install Dependencies
+
Execute the following instructions to install all the dependencies for the project.
We use `pip-tools` to ensure reproducible dependency resolution.

@@ -69,23 +71,30 @@ We use `pip-tools` to ensure reproducible dependency resolution.
# install pip-compile for python package dependency resolution
python3 -m pip install pip-tools

-# Generate list of all required python packages
+# generate list of all required python packages
mkdir build && cd build
cmake ..
make generate_py3_requirements
+
+# install the dependencies
python3 -m pip install -r ../requirement.txt
```

-### 3) Install monitor system
+### Install Monitoring System
+
RAGPerf uses a custom, low-overhead monitoring daemon. Please refer to the documentation at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions.

## Running RAGPerf
+
RAGPerf provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation.

### Quick Start with Web UI
+
-#### 1) Preparation
-Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell:
+#### Preparation
+
+Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell
+
```bash
# Make local "src" importable
export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
@@ -93,37 +102,53 @@ export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
# Where to cache Hugging Face models (optional, adjust path as needed)
export HF_HOME="/mnt/data/hf_home"
```
+
Install streamlit and run the RAGPerf client.
+
```bash
# install streamlit
python3 -m pip install streamlit
# run RAGPerf
streamlit run ui_client.py
```
+
Open the UI at the reported URL in your web browser; the default URL is `http://localhost:8501`.

-#### 2) Configure the benchmark and run
-To run the benchmark, we first need to set up the retriever like a vectorDB. See [vectordb](#vectordb). Then, on the webpage, customize your own workload setting. ![config](./doc/figures/ragconfig.png)
+#### Configuring the Benchmark
+
+To run the benchmark, we first need to set up the vector database (see [vectordb](#vectordb) for more details). Then, customize your own workload settings with all the available options on the webpage.
+
+![config](./doc/figures/ragconfig.png)
+
+#### Running the Benchmark

-Then in the execute page, click execute to execute the workload. You may also need to check the config file before the execution, see [here](./config/README.md) for config explanation. ![config](./doc/figures/run.png)
+In the execute page, click the `START BENCHMARK` button to execute the workload already configured. You may also want to check if all the configs are set correctly, see [here](./config/README.md) for the detailed explanation for different entries in the config file.
+
+![config](./doc/figures/run.png)
+
+### Run with Command Line Interface
+
+#### Preparation
+
+Set these environment variables once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell

-### Run with Command Line (CLI)
-#### 1) Preparation
-Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell:
```bash
-# Make local "src" importable
+# Make local `src` module importable
export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"

# Where to cache Hugging Face models (optional, adjust path as needed)
export HF_HOME="/mnt/data/hf_home"
```

-#### 2) Running the Benchmark
-To run the benchmark, you first need to set up the vectorDB as the retriever. See [vectordb](#vectordb) for a supported list and quick setup guide. Change the db_path to your local vectordb path in config file.
-```
+#### Running the Benchmark
+
+To run the benchmark, you first need to set up the vector database as the retriever. See [vectordb](#vectordb) for a supported list and quick setup guide. Change `db_path` to your local vector database storage path in the config file.
+
+```yaml
vector_db:
  db_path: /mnt/data/vectordb
```
+
First run the **preprocess/insert** phase to insert the dataset:

```bash
@@ -132,25 +157,29 @@ python3 src/run_new.py \
    --config config/lance_insert.yaml \
    --msys-config config/monitor/example_config.yaml
```
+
After the insertion stage, proceed to the **query/evaluate** stage. Run the following:
+
```bash
# 2) Retrieval and Query
python3 src/run_new.py \
    --config config/lance_query.yaml \
    --msys-config config/monitor/example_config.yaml
```
+
To customize your own workload setting, you may reference the provided config file within `./config` folder. The detailed parameters are listed [here](config/README.md)

-#### 3) Output Analysis
+#### Performing Analysis
+
You can check the output result within the `./output` folder. To visualize the output results, run `python3 example/monitoring_sys_lib/test_parser.py`, the visualized figures will be located within the `./output`.

## Supported RAG Pipeline Modules

-### VectorDB
+### Vector Databases

-RAGPerf already integrates with many popular vectorDBs. To set up, check the detailed documentations at [VectorDB README](src/vectordb/README.md)
+RAGPerf already integrates with many popular vector databases. To set up, check the detailed documentations at [VectorDB README](src/vectordb/README.md)

-Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database) to standardize operations. To add a new database
+Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database). This benchmark suit can automatically perform profiling and analysis on your desired vector database after implementing these APIs.
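+Since every supported backend sits behind this same standard API, switching retrievers is purely a config-level change. A hypothetical sketch (illustrative values only; the exact keys are documented in [config/README.md](config/README.md)):
+
+```yaml
+vector_db:
+  type: lancedb                # or milvus, qdrant, chroma, elasticsearch
+  db_path: /mnt/data/vectordb  # local path for embedded DBs, URL for server-based DBs
+  collection_name: rag_test
+```
+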
### Monitoring System From a3600ab82bc17b6d24614b2faaf3186f61d7c46b Mon Sep 17 00:00:00 2001 From: Eric Zhou Date: Sun, 11 Jan 2026 23:14:03 +0800 Subject: [PATCH 12/23] migrated some msys doc to main --- README.md | 53 +++++++++++++++++++++++++++++++++++++++- monitoring_sys/README.md | 6 ++--- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7486e52..dd7d08b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,9 @@ - [Create a Virtual Environment](#create-a-virtual-environment) - [Install Dependencies](#install-dependencies) - [Install Monitoring System](#install-monitoring-system) + - [C++ 20 Compatible Compiler Installation](#c-20-compatible-compiler-installation) + - [Protobuf Installation](#protobuf-installation) + - [Build MSys Shared Library and Position the Output Product to `src/monitoring_sys`](#build-msys-shared-library-and-position-the-output-product-to-srcmonitoring_sys) - [Running RAGPerf](#running-ragperf) - [Quick Start with Web UI](#quick-start-with-web-ui) - [Preparation](#preparation) @@ -83,7 +86,55 @@ python3 -m pip install -r ../requirement.txt ### Install Monitoring System -RAGPerf uses a custom, low-overhead monitoring daemon. Please refer to the documentation at [MonitoringSystem README](monitoring_sys/README.md) for compilation and installation instructions. +RAGPerf uses a custom, low-overhead monitoring daemon. Here is a stripped down version of installation procedures (please refer to [MonitoringSystem README](monitoring_sys/README.md) for more detailed instructions and explanations). + +#### C++ 20 Compatible Compiler Installation + +Check if system compiler already have the capability, if so, this step can be skipped. + +To install a C++ 20 compatible compiler in the virtual environment, for example, `gcc=12.1.0`, run + +```bash +conda install -c conda-forge gcc=12.1.0 +``` + +#### Protobuf Installation + +Install protobuf compiler and runtime library (modified from +[PROTOBUF_CMAKE](https://github.com/protocolbuffers/protobuf/blob/main/cmake/README.md)). +Currently, we are using version `v30.2`. + +```bash +# clone the protobuf repository somewhere +git clone https://github.com/protocolbuffers/protobuf.git +cd protobuf +# init and switch to desired version +git submodule update --init --recursive +git checkout v30.2 +# make & install to ~/.local +mkdir build && cd build +cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DBUILD_SHARED_LIBS=ON \ + -Dprotobuf_BUILD_SHARED_LIBS=ON \ + -Dprotobuf_BUILD_TESTS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="$HOME/.local" +cmake --build . --config Release -j +make install -j +``` + +#### Build MSys Shared Library and Position the Output Product to `src/monitoring_sys` + +Run the following commands in the project's build folder. + +```bash +# enter the python virtual environment +cmake -DCMAKE_BUILD_TYPE=Release .. +make libmsys_pymod -j +``` + +Make sure you see something like `libmsys.cpython-310-x86_64-linux-gnu.so` (the exact name could depend on your python version and architecture), that is the *cpython* module for the monitoring system executable. ## Running RAGPerf diff --git a/monitoring_sys/README.md b/monitoring_sys/README.md index a17747b..51d9568 100644 --- a/monitoring_sys/README.md +++ b/monitoring_sys/README.md @@ -45,12 +45,12 @@ Install protobuf compiler and runtime library (modified from Currently, we are using version `v30.2`. 
```bash
-# Clone the protobuf repository
+# clone the protobuf repository
git clone https://github.com/protocolbuffers/protobuf.git
cd protobuf
git submodule update --init --recursive
git checkout v30.2
-# Make & Install to ~/.local
+# make & install to ~/.local (can be configured)
mkdir build && cd build
cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_SHARED_LIBS=ON \
@@ -58,7 +58,7 @@ cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -Dprotobuf_BUILD_TESTS=OFF \
    -DCMAKE_CXX_STANDARD=17 \
    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX="$HOME/.local"
+    -DCMAKE_INSTALL_PREFIX="$HOME/.local" # can be configured
cmake --build . --config Release -j
make install -j
```

From b30e4efb4b733c3c58b161b17723f8ebef175808 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Sun, 11 Jan 2026 23:17:48 +0800
Subject: [PATCH 13/23] make some instructions more clear

---
 README.md | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index dd7d08b..b01654b 100644
--- a/README.md
+++ b/README.md
@@ -90,9 +90,7 @@ RAGPerf uses a custom, low-overhead monitoring daemon. Here is a stripped down v

#### C++ 20 Compatible Compiler Installation

-Check if system compiler already have the capability, if so, this step can be skipped.
-
-To install a C++ 20 compatible compiler in the virtual environment, for example, `gcc=12.1.0`, run
+Install a C++ 20 compatible compiler in the virtual environment, for example, `gcc=12.1.0`, run

```bash
conda install -c conda-forge gcc=12.1.0
@@ -100,9 +98,7 @@ conda install -c conda-forge gcc=12.1.0
```

#### Protobuf Installation

-Install protobuf compiler and runtime library (modified from
-[PROTOBUF_CMAKE](https://github.com/protocolbuffers/protobuf/blob/main/cmake/README.md)).
-Currently, we are using version `v30.2`.
+Install protobuf compiler and runtime library (modified from [PROTOBUF_CMAKE](https://github.com/protocolbuffers/protobuf/blob/main/cmake/README.md)). Currently, we are using version `v30.2`.

```bash
# clone the protobuf repository somewhere

From 600c865cf4eb9a923e6145bdf719b3a905077267 Mon Sep 17 00:00:00 2001
From: Shaobo Li
Date: Sun, 11 Jan 2026 09:25:25 -0600
Subject: [PATCH 14/23] update readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b01654b..9a15162 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ RAGPerf uses a custom, low-overhead monitoring daemon. Here is a stripped down v

#### C++ 20 Compatible Compiler Installation

-Install a C++ 20 compatible compiler in the virtual environment, for example, `gcc=12.1.0`, run
+Install a C++ 20 compatible compiler in the virtual environment. For example, to install `gcc=12.1.0`, run

```bash
conda install -c conda-forge gcc=12.1.0
@@ -130,7 +130,7 @@ cmake -DCMAKE_BUILD_TYPE=Release ..
make libmsys_pymod -j
```

-Make sure you see something like `libmsys.cpython-310-x86_64-linux-gnu.so` (the exact name could depend on your python version and architecture), that is the *cpython* module for the monitoring system executable.
+Make sure you see the file `libmsys.cpython-310-x86_64-linux-gnu.so` (the exact name could depend on your python version and architecture), that is the *cpython* module for the monitoring system executable.
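+As a quick sanity check (a sketch that assumes the built `libmsys` module has been placed in `src/monitoring_sys` and that directory is on your `PYTHONPATH`), try importing it:
+
+```bash
+python3 -c "import libmsys; print(libmsys.__file__)"
+```
+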
## Running RAGPerf From 7528cf91d4cc982fa7bfb781af28459b0a45afc8 Mon Sep 17 00:00:00 2001 From: Shaobo Li Date: Sun, 11 Jan 2026 09:25:39 -0600 Subject: [PATCH 15/23] update readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9a15162..e0ca204 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ RAGPerf provides an Interactive Web UI for ease of use. Or you can use the Comma #### Preparation -Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell +Set these once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell. ```bash # Make local "src" importable @@ -177,7 +177,7 @@ In the execute page, click the `START BENCHMARK` button to execute the workload #### Preparation -Set these environment variables once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell +Set these environment variables once in your shell rc file (e.g., `~/.bashrc` or `~/.zshrc`) or export them in every new shell. ```bash # Make local `src` module importable @@ -214,7 +214,7 @@ python3 src/run_new.py \ --msys-config config/monitor/example_config.yaml ``` -To customize your own workload setting, you may reference the provided config file within `./config` folder. The detailed parameters are listed [here](config/README.md) +To customize your own workload setting, you may reference the provided config file within `./config` folder. The detailed parameters are listed [here](config/README.md). #### Performing Analysis @@ -224,10 +224,10 @@ You can check the output result within the `./output` folder. To visualize the o ### Vector Databases -RAGPerf already integrates with many popular vector databases. To set up, check the detailed documentations at [VectorDB README](src/vectordb/README.md) +RAGPerf already integrates with many popular vector databases. To set up, check the detailed documentations at [VectorDB README](src/vectordb/README.md). Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database). This benchmark suit can automatically perform profiling and analysis on your desired vector database after implementing these APIs. ### Monitoring System -Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md) +Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md). 
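+For a quick end-to-end check that monitoring is wired up, run a monitored workload and then render the captured metrics (both commands use files that ship with the repo):
+
+```bash
+python3 src/run_new.py --config config/lance_query.yaml --msys-config config/monitor/example_config.yaml
+python3 example/monitoring_sys_lib/test_parser.py   # figures land in ./output
+```
+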
From 08d227cdf446181565fdc879fa63c9548b3a2a3 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Sun, 11 Jan 2026 23:40:23 +0800
Subject: [PATCH 16/23] modified style

---
 README.md | 41 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index e0ca204..784bd55 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +36,6 @@
  - [Install Dependencies](#install-dependencies)
  - [Install Monitoring System](#install-monitoring-system)
    - [C++ 20 Compatible Compiler Installation](#c-20-compatible-compiler-installation)
-    - [Protobuf Installation](#protobuf-installation)
    - [Build MSys Shared Library and Position the Output Product to `src/monitoring_sys`](#build-msys-shared-library-and-position-the-output-product-to-srcmonitoring_sys)
- [Running RAGPerf](#running-ragperf)
  - [Quick Start with Web UI](#quick-start-with-web-ui)
@@ -67,8 +66,7 @@ conda activate RAGPerf

### Install Dependencies

-Execute the following instructions to install all the dependencies for the project.
-We use `pip-tools` to ensure reproducible dependency resolution.
+Execute the following instructions to install all the dependencies for the project. We use `pip-tools` to ensure reproducible dependency resolution.

```bash
# install pip-compile for python package dependency resolution
@@ -96,30 +94,6 @@ Install a C++ 20 compatible compiler in the virtual environment. For example, to
conda install -c conda-forge gcc=12.1.0
```

-#### Protobuf Installation
-
-Install protobuf compiler and runtime library (modified from [PROTOBUF_CMAKE](https://github.com/protocolbuffers/protobuf/blob/main/cmake/README.md)). Currently, we are using version `v30.2`.
-
-```bash
-# clone the protobuf repository somewhere
-git clone https://github.com/protocolbuffers/protobuf.git
-cd protobuf
-# init and switch to desired version
-git submodule update --init --recursive
-git checkout v30.2
-# make & install to ~/.local
-mkdir build && cd build
-cmake .. -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-    -DBUILD_SHARED_LIBS=ON \
-    -Dprotobuf_BUILD_SHARED_LIBS=ON \
-    -Dprotobuf_BUILD_TESTS=OFF \
-    -DCMAKE_CXX_STANDARD=17 \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_INSTALL_PREFIX="$HOME/.local"
-cmake --build . --config Release -j
-make install -j
-```
-
#### Build MSys Shared Library and Position the Output Product to `src/monitoring_sys`

Run the following commands in the project's build folder.

@@ -150,7 +124,7 @@ export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
# Where to cache Hugging Face models (optional, adjust path as needed)
export HF_HOME="/mnt/data/hf_home"
```

-Install streamlit and run the RAGPerf client.
+Install `streamlit` and run the RAGPerf client.

@@ -181,7 +155,8 @@ Set these environment variables once in your shell rc file (e.g., `~/.bashrc` or

```bash
# Make local `src` module importable
-export PYTHONPATH="$REPO_ROOT/src${PYTHONPATH+:$PYTHONPATH}"
+# set variable REPO_ROOT to the correct path of the repo
+export PYTHONPATH="$REPO_ROOT/src:$PYTHONPATH"

# Where to cache Hugging Face models (optional, adjust path as needed)
export HF_HOME="/mnt/data/hf_home"
@@ -196,7 +171,7 @@ vector_db:
  db_path: /mnt/data/vectordb
```

-First run the **preprocess/insert** phase to insert the dataset:
+First run the **preprocess/insert** phase to insert the dataset.

```bash
# 1) Build/insert into the vector store (LanceDB example)
@@ -205,7 +180,7 @@ python3 src/run_new.py \
    --msys-config config/monitor/example_config.yaml
```

-After the insertion stage, proceed to the **query/evaluate** stage. Run the following:
+After the insertion stage, proceed to the **query/evaluate** stage.

```bash
# 2) Retrieval and Query
python3 src/run_new.py \
    --config config/lance_query.yaml \
    --msys-config config/monitor/example_config.yaml
```
+
-To customize your own workload setting, you may reference the provided config file within `./config` folder. The detailed parameters are listed [here](config/README.md).
+To customize your own workload setting, you may reference the provided config file within `config` folder. The detailed parameters are listed [here](config/README.md).

#### Performing Analysis

-You can check the output result within the `./output` folder. To visualize the output results, run `python3 example/monitoring_sys_lib/test_parser.py`, the visualized figures will be located within the `./output`.
+You can check the output result within the `output` folder. To visualize the output results, run `python3 example/monitoring_sys_lib/test_parser.py`; the visualized figures will be located within the `output` folder.

## Supported RAG Pipeline Modules

From c7ae48036afa9869cb4b6e50468173b23e5c17c7 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Sun, 11 Jan 2026 23:40:37 +0800
Subject: [PATCH 17/23] make table look better in code

---
 config/README.md | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/config/README.md b/config/README.md
index ca71925..3423032 100644
--- a/config/README.md
+++ b/config/README.md
@@ -4,8 +4,8 @@ This document details the configuration parameters used in RAGPerf. The conf

## 1. Top-Level Metadata

-| Parameter | Description |
-| :--- | :--- |
+| Parameter      | Description                                                                                                                       |
+| :------------- | :------------------------------------------------------------------------------------------------------------------------------ |
| **`run_name`** | A unique identifier for the current experiment (e.g., `default_run`). This is used for naming log files and output directories.  |

---
@@ -50,12 +50,12 @@ rag:
### 3.2 Embedding (`embedding`)
Configuration for the model that converts text/images into vectors.

-| Parameter | Description |
-| :--- | :--- |
-| `device` | GPU device identifier (e.g., `cuda:0`). |
+| Parameter                    | Description                                                           |
+| :--------------------------- | :-------------------------------------------------------------------- |
+| `device`                     | GPU device identifier (e.g., `cuda:0`).                               |
| `sentence_transformers_name` | Name of the model (e.g., `all-MiniLM-L6-v2`, `vidore/colpali-v1.2`).  |
-| `batch_size` | Number of items processed per batch during embedding. |
-| `embedding_framework` | Backend framework (e.g., `sentence_transformers`). |
+| `batch_size`                 | Number of items processed per batch during embedding.                |
+| `embedding_framework`        | Backend framework (e.g., `sentence_transformers`).                   |

### 3.3 Vector Database Operations (`insert`, `build_index`)
Parameters for writing data and creating efficient search structures.
@@ -89,16 +89,16 @@ rag:
### 3.5 Generation (`generation`)
Settings for the Large Language Model (LLM) that generates the final answer.

-| Parameter | Description |
-| :--- | :--- |
-| `device` | GPU device identifier. |
-| `model` | Path or name of the LLM (e.g., `Qwen/Qwen2.5-7B-Instruct`). |
+| Parameter | Description                                                  |
+| :-------- | :---------------------------------------------------------- |
+| `device`  | GPU device identifier.                                       |
+| `model`   | Path or name of the LLM (e.g., `Qwen/Qwen2.5-7B-Instruct`).  |

### 3.6 Evaluation (`evaluate`)
Settings for automated quality assessment (e.g., using RAGAS).
-| Parameter | Description |
-| :--- | :--- |
+| Parameter         | Description                                             |
+| :---------------- | :------------------------------------------------------ |
| `evaluator_model` | Model used as the judge for metrics like faithfulness.  |

---
@@ -121,14 +121,14 @@

### 4.2 Devices (`devices`)

-| Parameter | Description |
-| :--- | :--- |
-| `cpu` | CPU identifier. |
-| `gpu_count` | Number of GPUs available to the system. |
-| `gpus` | List of specific GPU IDs (e.g., `["cuda:0", "cuda:1"]`). |
+| Parameter   | Description                                               |
+| :---------- | :-------------------------------------------------------- |
+| `cpu`       | CPU identifier.                                           |
+| `gpu_count` | Number of GPUs available to the system.                   |
+| `gpus`      | List of specific GPU IDs (e.g., `["cuda:0", "cuda:1"]`).  |

### 4.3 Logging (`log`)

-| Parameter | Description |
-| :--- | :--- |
+| Parameter     | Description                            |
+| :------------ | :------------------------------------- |
| `metrics_log` | Path for the main execution log file.  |
\ No newline at end of file

From 17a1428c8eebb681c0ed9ad12ebb248715c10bf4 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Sun, 11 Jan 2026 23:49:07 +0800
Subject: [PATCH 18/23] formatting readme

---
 src/vectordb/README.md | 70 ++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/src/vectordb/README.md b/src/vectordb/README.md
index 440f795..6dd47cf 100644
--- a/src/vectordb/README.md
+++ b/src/vectordb/README.md
@@ -7,13 +7,13 @@ This module provides a unified interface for interacting with various locally de

We currently support the following vector databases and index types:

-| Database | Supported Index Types | Device Support | Notes |
-| :--- | :--- | :--- | :--- |
-| **LanceDB** | IVF-PQ, IVF-Flat, HNSW | CPU/GPU | Embedded, serverless, highly memory efficient. |
-| **Milvus** | HNSW, IVF, DiskANN, ScaNN | CPU/GPU | Requires a running server instance (Docker/K8s). |
-| **Qdrant** | HNSW | CPU/GPU | Requires a running server instance. |
-| **Chroma** | HNSW | CPU | Embedded or Client/Server. |
-| **Elasticsearch** | HNSW, Flat | CPU | Requires a running server instance. |
+| Database          | Supported Index Types     | Device Support | Notes                                             |
+| :---------------- | :------------------------ | :------------- | :------------------------------------------------ |
+| **LanceDB**       | IVF-PQ, IVF-Flat, HNSW    | CPU/GPU        | Embedded, serverless, highly memory efficient.    |
+| **Milvus**        | HNSW, IVF, DiskANN, ScaNN | CPU/GPU        | Requires a running server instance (Docker/K8s).  |
+| **Qdrant**        | HNSW                      | CPU/GPU        | Requires a running server instance.               |
+| **Chroma**        | HNSW                      | CPU            | Embedded or Client/Server.                        |
+| **Elasticsearch** | HNSW, Flat                | CPU            | Requires a running server instance.               |

---

@@ -22,6 +22,7 @@ We currently support the following vector databases and index types:

Before running the benchmark, ensure you have installed the necessary Python drivers. If you followed the main installation guide, these should already be in your environment.

### 1. LanceDB (Recommended for Local Testing)
+
LanceDB runs in-process and does not require a separate server.

* **Prerequisites:** `pip install lancedb`
* **Storage:** Data is stored in a local directory (e.g., `./lancedb_data`).

**Configuration (`config/your_config.yaml`):**
```yaml
vector_db:
  type: "lancedb"
@@ -37,17 +38,18 @@ vector_db:

### 2. Milvus (GPU via Docker Compose)
-If you plan to use **Milvus** as the vector store, follow the official guide to run Milvus with GPU support using Docker Compose:
+If you plan to use **Milvus** as the vector store, follow the official guide to run Milvus with GPU support using Docker Compose:
➡️ **[Run Milvus with GPU Support Using Docker Compose](https://milvus.io/docs/install_standalone-docker-compose-gpu.md)**

After Milvus is up, point your pipeline config to its url:
+
```yaml
- vector_db:
-     collection_name: 'milvus_test'
-     db_path: http://localhost:19530
-     db_token: root:Milvus
-     drop_previous_collection: false
-     type: milvus
+vector_db:
+  collection_name: 'milvus_test'
+  db_path: http://localhost:19530
+  db_token: root:Milvus
+  drop_previous_collection: false
+  type: milvus
```

3. Qdrant (Docker)
To use Qdrant, run the official Docker container. This exposes the database on port 6333.

```bash
docker run -p 6333:6333 -p 6334:6334 \
```
Change the configuration to use Qdrant:
```yaml
- vector_db:
-     type: "qdrant"
-     db_path: "http://localhost:6333" # Qdrant server URL
-     collection_name: "test_collection"
-     # Qdrant doesn't typically need a token for local docker, but if configured:
-     # db_token: "your-api-key"
+vector_db:
+  type: "qdrant"
+  db_path: "http://localhost:6333" # Qdrant server URL
+  collection_name: "test_collection"
+  # Qdrant doesn't typically need a token for local docker, but if configured:
+  # db_token: "your-api-key"
```

4. Chroma (Embedded or Client/Server)
Chroma is often used in an embedded mode (similar to LanceDB) but can also run as a server. The default setup here assumes a persistent local storage mode.

* **Prerequisites:** `pip install chroma_db`
* **Storage:** Data is stored in a local directory (e.g., `./chroma_data`).

```yaml
- vector_db:
-     type: "chroma"
-     db_path: "./chroma_data" # Local path for persistence
-     collection_name: "chroma_test"
+vector_db:
+  type: "chroma"
+  db_path: "./chroma_data" # Local path for persistence
+  collection_name: "chroma_test"
```

5. Elasticsearch (Docker with kNN)
Elasticsearch supports dense vector search natively. Ensure you have the necessary memory allocated to Docker.

* Run Elasticsearch with docker:
-  ```bash
-  docker run -p 9200:9200 -e "discovery.type=single-node" \
-    -e "xpack.security.enabled=false" \
-    -m 4GB docker.elastic.co/elasticsearch/elasticsearch:8.11.1
+```bash
+docker run -p 9200:9200 -e "discovery.type=single-node" \
+  -e "xpack.security.enabled=false" \
+  -m 4GB docker.elastic.co/elasticsearch/elasticsearch:8.11.1
```
Configuration:
```yaml
- vector_db:
-     type: "elasticsearch"
-     db_path: "http://localhost:9200"
-     collection_name: "elastic_test"
-     drop_previous_collection: true # Elastic indices often need fresh creation for mapping changes
+vector_db:
+  type: "elasticsearch"
+  db_path: "http://localhost:9200"
+  collection_name: "elastic_test"
+  drop_previous_collection: true # Elastic indices often need fresh creation for mapping changes
```

@@ -105,6 +107,7 @@ This pipeline uses an abstract base class, DBInstance (defined in [DBInstance.py
1. Create a New Class: Create a new file (e.g., MyNewDB.py) in vectordb.
2. Inherit from DBInstance: Implement all abstract methods defined in the base class.
+
```python
from .DBInstance import DBInstance

class MyNewDB(DBInstance):
    # Search logic returning top_k results
    pass
```
+
3. Register the Class: Add your new class in `run_new.py` so it can be instantiated via the config type string, as sketched below.
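+A registration hook might look like the following sketch (hypothetical names; the actual dispatch in `run_new.py` may differ):
+
+```python
+from vectordb.MyNewDB import MyNewDB
+
+# map the `type` string from the YAML config to the implementing class
+DB_CLASSES = {
+    "mynewdb": MyNewDB,
+}
+
+def build_db_instance(vector_db_cfg: dict):
+    """Instantiate the backend selected by `vector_db.type` in the config."""
+    return DB_CLASSES[vector_db_cfg["type"]](vector_db_cfg)
+```
+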
From 79df306f624b89c77c6953b703f0c1218794d927 Mon Sep 17 00:00:00 2001
From: Shaobo Li
Date: Sun, 11 Jan 2026 09:58:26 -0600
Subject: [PATCH 19/23] update readme

---
 src/vectordb/README.md | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/src/vectordb/README.md b/src/vectordb/README.md
index 6dd47cf..a966689 100644
--- a/src/vectordb/README.md
+++ b/src/vectordb/README.md
@@ -19,16 +19,13 @@ We currently support the following vector databases and index types:

## 🛠️ Setup Instructions by Type

-Before running the benchmark, ensure you have installed the necessary Python drivers. If you followed the main installation guide, these should already be in your environment.
+Before running the benchmark, ensure you have installed the necessary Python dependencies. If you followed the main installation guide, these should already be in your environment.

### 1. LanceDB (Recommended for Local Testing)

-LanceDB runs in-process and does not require a separate server.
+LanceDB does not require a separate server installation. To install LanceDB, run `pip install lancedb`.

-* **Prerequisites:** `pip install lancedb`
-* **Storage:** Data is stored in a local directory (e.g., `./lancedb_data`).
-
-**Configuration (`config/your_config.yaml`):**
+Change the configuration (`config/your_config.yaml`):
```yaml
vector_db:
  type: "lancedb"
@@ -52,7 +49,7 @@ vector_db:
  type: milvus
```

-3. Qdrant (Docker)
+### 3. Qdrant (Docker)
To use Qdrant, run the official Docker container. This exposes the database on port 6333.

```bash
@@ -70,29 +67,25 @@ vector_db:
  # db_token: "your-api-key"
```

-4. Chroma (Embedded or Client/Server)
-Chroma is often used in an embedded mode (similar to LanceDB) but can also run as a server. The default setup here assumes a persistent local storage mode.
-
-* **Prerequisites:** `pip install chroma_db`
-* **Storage:** Data is stored in a local directory (e.g., `./chroma_data`).
+### 4. Chroma (Embedded or Client/Server)
+Chroma is often used in an embedded mode (similar to LanceDB) but can also run as a server. To quickly set up Chroma, run `pip install chromadb`.

+Change the configuration to use Chroma:
```yaml
vector_db:
  type: "chroma"
-  db_path: "./chroma_data" # Local path for persistence
+  db_path: "./chroma_data" # Local path for db data storage
  collection_name: "chroma_test"
```

-5. Elasticsearch (Docker with kNN)
-Elasticsearch supports dense vector search natively. Ensure you have the necessary memory allocated to Docker.
-
-* Run Elasticsearch with docker:
+### 5. Elasticsearch (Docker with kNN)
+Elasticsearch supports dense vector search natively. Ensure you have the necessary memory allocated to Docker. Run Elasticsearch with Docker:
```bash
docker run -p 9200:9200 -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -m 4GB docker.elastic.co/elasticsearch/elasticsearch:8.11.1
```
-Configuration:
+Change the configuration to include the Elasticsearch URL:
```yaml
vector_db:
  type: "elasticsearch"

From d5eaa82334aebc780b703f0c121879b4d927c55a Mon Sep 17 00:00:00 2001
From: Jian Huang <33913768+jianhtech@users.noreply.github.com>
Date: Sun, 11 Jan 2026 11:10:09 -0600
Subject: [PATCH 20/23] Revise README for improved clarity and consistency

Updated wording for clarity and consistency in the README.
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 784bd55..f8216f8 100644
--- a/README.md
+++ b/README.md
@@ -18,11 +18,11 @@

**🚀 Holistic System-Centric Benchmarking**: RAGPerf moves beyond simple accuracy metrics to profile the performance of RAG systems. It measures end-to-end throughput (QPS), latency breakdowns, and hardware efficiency. This helps developers identify potential bottlenecks throughout the entire pipeline.

-**🧩 Modular Architecture**: RAGPerf uses a modular design that abstracts different stages of the RAG pipeline (Embedding, Vector Database, Reranking, and Generation) behind uniform interfaces. Users can seamlessly switch components (e.g., switching underlyinig vector database from Milvus to LanceDB, or change underlying generative model from ChatGPT to Qwen) without rewriting code. This enables detailed performance comparisons between different pipelines.
+**🧩 Modular Architecture**: RAGPerf uses a modular design that abstracts different stages of the RAG pipeline (Embedding, Vector Database, Reranking, and Generation) behind uniform interfaces. Users can seamlessly switch components (e.g., switching the underlying vector database from Milvus to LanceDB, or changing the underlying generative model from GPT to Qwen) without rewriting code. This enables detailed performance comparisons between different system settings.

-**📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight system profiler that runs as a background daemon. It captures fine-grained hardware metrics with minimal overhead, including GPU/CPU utilization, memory consumptions (host RAM & GPU VRAM), PCIe throughput, and disk I/O utilization. This allows detailed analysis of resource utilization between RAG components and help finding potential contention issues.
+**📊 Detailed Full-Stack Profiling**: RAGPerf integrates a lightweight profiler that runs as a background daemon. It captures fine-grained hardware metrics with minimal overhead, including GPU/CPU utilization, memory consumption (host RAM & GPU VRAM), PCIe throughput, and disk I/O utilization. This allows detailed analysis of resource utilization between RAG components and helps identify potential system bottlenecks.

-**🔄 Simulating Real-World Scenarios**: RAGPerf is able to simulate the evolution of real-world knowledge bases by synthesizing updates with a custom and configurable workload generator. The workload generator supports generating insert, update, and delete requests at different frequency and patterns, allowing users to estimate how data freshness and overall system performance varies in real systems.
+**🔄 Simulating Real-World Scenarios**: RAGPerf is able to simulate the evolution of real-world knowledge bases by synthesizing updates with a custom and configurable workload generator. The workload generator supports insert, update, and delete requests at different frequencies and patterns, allowing users to estimate how data freshness and system performance vary in real systems.

**🖼️ Multi-Modal Capabilities**: RAGPerf supports diverse data modalities beyond plain text. It provides specialized pipelines including Visual RAG (PDFs, Images) using OCR or ColPali visual embeddings, and Audio RAG using ASR models like Whisper. This enables benchmarking of complex, unstructured RAG pipelines.
From 9dd36e2dc5f5a64da3e7e0b0ca33f24cdbf53b7a Mon Sep 17 00:00:00 2001
From: Jian Huang <33913768+jianhtech@users.noreply.github.com>
Date: Sun, 11 Jan 2026 11:10:50 -0600
Subject: [PATCH 21/23] Update table of contents in README.md

Removed 'Key Features' section from the table of contents.

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index f8216f8..7571f27 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,6 @@

## Table of Contents

-- [Key Features](#key-features)
- [Installation](#installation)
  - [Create a Virtual Environment](#create-a-virtual-environment)
  - [Install Dependencies](#install-dependencies)

From 122ace7cd21714d04a716d0fdc04454d695b5097 Mon Sep 17 00:00:00 2001
From: Jian Huang <33913768+jianhtech@users.noreply.github.com>
Date: Sun, 11 Jan 2026 11:12:56 -0600
Subject: [PATCH 22/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7571f27..315fdab 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@

### Create a Virtual Environment

-To run RAGPerf, we highly recommend using an isolated Python environment using a Python virtual environment manager (e.g., `venv`, `conda`) to avoid package conflicts, we use `conda` for demonstrating purposes throughout the documentation.
+To run RAGPerf, we highly recommend using an isolated Python environment. You can use a Python virtual environment manager (e.g., `venv`, `conda`) to avoid package conflicts. We use `conda` for demonstration purposes throughout the documentation.

**Conda (recommended)**
```bash

From b9cae3b9eef6bcd575ef8cb52fdbcc4dc3 Mon Sep 17 00:00:00 2001
From: Jian Huang <33913768+jianhtech@users.noreply.github.com>
Date: Sun, 11 Jan 2026 11:21:19 -0600
Subject: [PATCH 23/23] Fix typos and improve clarity in README.md

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 315fdab..3b6e96f 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ python3 -m pip install -r ../requirement.txt

### Install Monitoring System

-RAGPerf uses a custom, low-overhead monitoring daemon. Here is a stripped down version of installation procedures (please refer to [MonitoringSystem README](monitoring_sys/README.md) for more detailed instructions and explanations).
+RAGPerf uses a custom, low-overhead monitoring daemon. Here is a stripped-down version of the installation procedure (please refer to [MonitoringSystem README](monitoring_sys/README.md) for detailed instructions and explanations).

#### C++ 20 Compatible Compiler Installation

@@ -93,7 +93,7 @@ Install a C++ 20 compatible compiler in the virtual environment. For example, to
conda install -c conda-forge gcc=12.1.0
```

-#### Build MSys Shared Library and Position the Output Product to `src/monitoring_sys`
+#### Build MSys Shared Library and Position the Output to `src/monitoring_sys`

Run the following commands in the project's build folder.

@@ -103,11 +103,11 @@ cmake -DCMAKE_BUILD_TYPE=Release ..
make libmsys_pymod -j
```

-Make sure you see the file `libmsys.cpython-310-x86_64-linux-gnu.so` (the exact name could depend on your python version and architecture), that is the *cpython* module for the monitoring system executable.
+Make sure you see the file `libmsys.cpython-310-x86_64-linux-gnu.so` (the exact name could depend on your Python version and architecture); this is the *CPython* module for the monitoring system executable.
## Running RAGPerf

-RAGPerf provides an Interactive Web UI for ease of use. Or you can use the Command Line (CLI) for automation.
+RAGPerf provides an Interactive Web UI for ease of use. You can also use the Command Line (CLI) for automation.

### Quick Start with Web UI

@@ -142,7 +142,7 @@ To run the benchmark, we first need to set up the vector database (see [vectordb](#vectordb) for more details

#### Running the Benchmark

-In the execute page, click the `START BENCHMARK` button to execute the workload already configured. You may also want to check if all the configs are set correctly, see [here](./config/README.md) for the detailed explanation for different entries in the config file.
+On the execute page, click the `START BENCHMARK` button to execute the configured workload. You may also want to check that all the configs are set correctly; see [here](./config/README.md) for a detailed explanation of the different entries in the config file.

![config](./doc/figures/run.png)

@@ -188,7 +188,7 @@ python3 src/run_new.py \
    --msys-config config/monitor/example_config.yaml
```

-To customize your own workload setting, you may reference the provided config file within `config` folder. The detailed parameters are listed [here](config/README.md).
+To customize your own workload setting, you may refer to the provided config files within the `config` folder. The detailed parameters are listed [here](config/README.md).

#### Performing Analysis

You can check the output result within the `output` folder. To visualize the output results, run `python3 example/monitoring_sys_lib/test_parser.py`; the visualized figures will be located within the `output` folder.

## Supported RAG Pipeline Modules

### Vector Databases

-RAGPerf already integrates with many popular vector databases. To set up, check the detailed documentations at [VectorDB README](src/vectordb/README.md).
+RAGPerf supports many popular vector databases. To set one up, check the detailed documentation at [VectorDB README](src/vectordb/README.md).

-Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database). This benchmark suit can automatically perform profiling and analysis on your desired vector database after implementing these APIs.
+Want to add a new DB? Check our RAGPerf API at [VectorDB API](src/vectordb/README.md#adding-a-new-vector-database). This benchmark suite can automatically perform profiling and analysis on your vector database once these APIs are implemented.

### Monitoring System

-Examples of how to use it are documented in `example/monitoring_sys_lib`. Detailed documentations at [MonitoringSystem README](monitoring_sys/README.md).
+Examples of how to use the monitoring system are documented in `example/monitoring_sys_lib`. Detailed documentation is available at [MonitoringSystem README](monitoring_sys/README.md).