From e1d7b43409436682a8adf77ff2b95ad7a7bcc9df Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 15:21:15 +0100 Subject: [PATCH 01/14] implement sleeping at queue level --- common/arg.cpp | 7 ++ common/common.h | 3 +- tools/server/server-context.cpp | 146 +++++++++++++++++++------------- tools/server/server-context.h | 2 +- tools/server/server-queue.cpp | 84 ++++++++++++++---- tools/server/server-queue.h | 51 +++++++++-- tools/server/server.cpp | 10 ++- 7 files changed, 217 insertions(+), 86 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 476bc0084a4..0f8ad61ffe2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2887,6 +2887,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--sleep-idle-seconds"}, "SECONDS", + string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds), + [](common_params & params, int value) { + params.sleep_idle_seconds = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", diff --git a/common/common.h b/common/common.h index 3e314f4c802..334372073a9 100644 --- a/common/common.h +++ b/common/common.h @@ -475,7 +475,8 @@ struct common_params { bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; int reasoning_budget = -1; - bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response + bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response + int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time std::vector api_keys; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 68a5fd8ab08..494971856b9 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -544,7 +544,9 @@ struct server_context_impl { server_metrics metrics; - json webui_settings = json::object(); + // cached responses for HTTP API + json json_server_props = json::object(); + // json json_server_models = json::object(); // TODO // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -577,16 +579,6 @@ struct server_context_impl { params_base = params; - webui_settings = json::object(); - if (!params_base.webui_config_json.empty()) { - try { - webui_settings = json::parse(params_base.webui_config_json); - } catch (const std::exception & e) { - SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); - return false; - } - } - llama_init = common_init_from_params(params_base); model = llama_init->model(); @@ -703,7 +695,7 @@ struct server_context_impl { } // initialize slots and server-related data - void init() { + bool init() { // wiring up server queues queue_tasks.on_new_task([this](server_task && task) { process_single_task(std::move(task)); @@ -711,6 +703,10 @@ struct server_context_impl { queue_tasks.on_update_slots([this]() { update_slots(); }); + queue_tasks.on_sleeping_state([](bool sleeping) { + // TODO: handle sleeping state + SRV_INF("server queue is now %s\n", sleeping ? 
"sleeping" : "awake"); + }); // Necessary similarity of prompt for slot selection slot_prompt_similarity = params_base.slot_prompt_similarity; @@ -742,13 +738,13 @@ struct server_context_impl { slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); if (slot.ctx_dft == nullptr) { SRV_ERR("%s", "failed to create draft context\n"); - return; + return false; } slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft); if (slot.spec == nullptr) { SRV_ERR("%s", "failed to create speculator\n"); - return; + return false; } for (auto & pair : params_base.speculative.replacements) { common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str()); @@ -832,6 +828,65 @@ struct server_context_impl { LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, common_chat_templates_source(chat_templates.get()), common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + + if (!populate_json_responses()) { + SRV_ERR("%s", "failed to populate JSON responses\n"); + return false; + } + + return true; + } + + bool populate_json_responses() { + // populate webui settings + json json_webui_settings = json::object(); + { + if (!params_base.webui_config_json.empty()) { + try { + json_webui_settings = json::parse(params_base.webui_config_json); + } catch (const std::exception & e) { + SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + return false; + } + } + } + + // populate server properties + { + task_params params; + params.sampling = params_base.sampling; + json default_generation_settings_for_props = json { + {"params", params.to_json(true)}, + {"n_ctx", get_slot_n_ctx()}, + }; + + json_server_props = { + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", params_base.n_parallel }, + { "model_alias", model_name }, + { "model_path", params_base.model.path }, + { "modalities", json { + {"vision", oai_parser_opt.allow_image}, + {"audio", oai_parser_opt.allow_audio}, + } }, + { "endpoint_slots", params_base.endpoint_slots }, + { "endpoint_props", params_base.endpoint_props }, + { "endpoint_metrics", params_base.endpoint_metrics }, + { "webui", params_base.webui }, + { "webui_settings", json_webui_settings }, + { "chat_template", common_chat_templates_source(chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx, llama_vocab_bos(vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx, llama_vocab_eos(vocab), /* special= */ true)}, + { "build_info", build_info }, + }; + if (params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(chat_templates.get(), "tool_use")) { + json_server_props["chat_template_tool_use"] = tool_use_src; + } + } + } + + return true; } server_slot * get_slot_by_id(int id) { @@ -2662,8 +2717,8 @@ struct server_context_impl { server_context::server_context() : impl(new server_context_impl()) {} server_context::~server_context() = default; -void server_context::init() { - impl->init(); +bool server_context::init() { + return impl->init(); } bool server_context::load_model(const common_params & params) { @@ -2671,7 +2726,8 @@ bool server_context::load_model(const common_params & params) { } void server_context::start_loop() { - impl->queue_tasks.start_loop(); + auto & params = impl->params_base; + impl->queue_tasks.start_loop(params.sleep_idle_seconds * 1000); } void server_context::terminate() { @@ -2698,10 +2754,15 @@ server_context_info 
server_context::get_info() const { // generator-like API for HTTP response generation +// may have bypass_sleep = true if the task does not use ctx_server struct server_res_generator : server_http_res { server_response_reader rd; - server_res_generator(server_context_impl & ctx_server) - : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {} + server_res_generator(server_context_impl & ctx_server, bool bypass_sleep = false) + : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) { + if (!bypass_sleep) { + ctx_server.queue_tasks.wait_until_no_sleep(); + } + } void ok(const json & response_data) { status = 200; data = safe_json_to_str(response_data); @@ -2933,7 +2994,7 @@ static std::unique_ptr handle_completions_impl( void server_routes::init_routes() { this->get_health = [this](const server_http_req &) { // error and loading states are handled by middleware - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server, true); res->ok({{"status", "ok"}}); return res; }; @@ -3115,46 +3176,10 @@ void server_routes::init_routes() { }; this->get_props = [this](const server_http_req &) { - auto res = std::make_unique(ctx_server); - json default_generation_settings_for_props; - - { - task_params params; - - params.sampling = ctx_server.params_base.sampling; - - default_generation_settings_for_props = json { - {"params", params.to_json(true)}, - {"n_ctx", ctx_server.get_slot_n_ctx()}, - }; - } - - json data = { - { "default_generation_settings", default_generation_settings_for_props }, - { "total_slots", ctx_server.params_base.n_parallel }, - { "model_alias", ctx_server.model_name }, - { "model_path", ctx_server.params_base.model.path }, - { "modalities", json { - {"vision", ctx_server.oai_parser_opt.allow_image}, - {"audio", ctx_server.oai_parser_opt.allow_audio}, - } }, - { "endpoint_slots", params.endpoint_slots }, - { "endpoint_props", params.endpoint_props }, - { "endpoint_metrics", params.endpoint_metrics }, - { "webui", params.webui }, - { "webui_settings", ctx_server.webui_settings }, - { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, - { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, - { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, - { "build_info", build_info }, - }; - if (ctx_server.params_base.use_jinja) { - if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { - data["chat_template_tool_use"] = tool_use_src; - } - } - - res->ok(data); + auto res = std::make_unique(ctx_server, true); + auto props = ctx_server.json_server_props; + props["is_sleeping"] = ctx_server.queue_tasks.is_sleeping(); + res->ok(props); return res; }; @@ -3365,6 +3390,7 @@ void server_routes::init_routes() { return res; }; + // TODO: allow this endpoint to be accessed bypassing sleep mode, same method as get_props this->get_models = [this](const server_http_req &) { auto res = std::make_unique(ctx_server); json model_meta = nullptr; diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 230b25952e4..34a499f1ca7 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -23,7 +23,7 @@ struct server_context { ~server_context(); // initialize slots and server-related data - void init(); + bool init(); // load the model and initialize llama_context // returns true on success diff --git 
a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index 3cceb2bbe21..835938bfc25 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) { } else { queue_tasks.push_back(std::move(task)); } + time_last_task = ggml_time_ms(); condition_tasks.notify_one(); return task_id; } @@ -54,6 +55,7 @@ int server_queue::post(std::vector && tasks, bool front) { queue_tasks.push_back(std::move(task)); } } + time_last_task = ggml_time_ms(); condition_tasks.notify_one(); return 0; } @@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) { std::unique_lock lock(mutex_tasks); QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); + time_last_task = ggml_time_ms(); condition_tasks.notify_one(); } @@ -71,31 +74,52 @@ int server_queue::get_new_id() { return new_id; } -void server_queue::on_new_task(std::function callback) { - callback_new_task = std::move(callback); -} - -void server_queue::on_update_slots(std::function callback) { - callback_update_slots = std::move(callback); -} - void server_queue::pop_deferred_task() { std::unique_lock lock(mutex_tasks); if (!queue_tasks_deferred.empty()) { queue_tasks.emplace_front(std::move(queue_tasks_deferred.front())); queue_tasks_deferred.pop_front(); } + time_last_task = ggml_time_ms(); condition_tasks.notify_one(); } +void server_queue::wait_until_no_sleep() { + std::unique_lock lock(mutex_tasks); + if (!sleeping) { + return; + } else { + if (!req_stop_sleeping) { + QUE_DBG("%s", "requesting to stop sleeping\n"); + req_stop_sleeping = true; + condition_tasks.notify_one(); // only main thread is waiting on this + } + QUE_DBG("%s", "waiting until no sleep\n"); + condition_tasks.wait(lock, [&]{ + return !sleeping; + }); + } +} + void server_queue::terminate() { std::unique_lock lock(mutex_tasks); running = false; condition_tasks.notify_all(); } -void server_queue::start_loop() { +void server_queue::start_loop(int64_t idle_sleep_ms) { running = true; + time_last_task = ggml_time_ms(); + + constexpr auto max_wait_time = std::chrono::seconds(1); + auto should_sleep = [&]() -> bool { + // caller must hold mutex_tasks + if (idle_sleep_ms < 0) { + return false; + } + int64_t now = ggml_time_ms(); + return (now - time_last_task) >= idle_sleep_ms; + }; while (true) { QUE_DBG("%s", "processing new tasks\n"); @@ -117,23 +141,53 @@ void server_queue::start_loop() { QUE_DBG("processing task, id = %d\n", task.id); callback_new_task(std::move(task)); } - // all tasks in the current loop is processed, slots data is now ready QUE_DBG("%s", "update slots\n"); + // this will run the main inference process for all slots callback_update_slots(); + { + // update_slots() may take a while to finish, we need to make sure it's not counted as idle + std::unique_lock lock(mutex_tasks); + time_last_task = ggml_time_ms(); + } QUE_DBG("%s", "waiting for new tasks\n"); - { + while (true) { std::unique_lock lock(mutex_tasks); - if (!running) { - QUE_DBG("%s", "terminate\n"); - return; + if (!running || !queue_tasks.empty()) { + break; // go back to process new tasks or terminate } - if (queue_tasks.empty()) { + + // no tasks, check for sleeping state + if (should_sleep()) { + QUE_INF("%s", "entering sleeping state\n"); + sleeping = true; + callback_sleeping_state(true); + req_stop_sleeping = false; + // wait until we are requested to exit sleeping state condition_tasks.wait(lock, [&]{ + return (!running || req_stop_sleeping); + }); + if 
(!running) { // may have changed during sleep break; // terminate } QUE_INF("%s", "exiting sleeping state\n"); req_stop_sleeping = false; callback_sleeping_state(false); sleeping = false; time_last_task = ggml_time_ms(); condition_tasks.notify_all(); // notify wait_until_no_sleep() break; // process new tasks } else { // wait for new tasks or timeout for checking sleeping condition bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{ return (!queue_tasks.empty() || !running); }); if (res) { break; // new task arrived or terminate } // otherwise, loop again to check sleeping condition } } } diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 8780d7fe129..66a90e50a66 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -12,7 +12,10 @@ struct server_queue { private: int id = 0; - bool running; + bool running = false; + bool sleeping = false; + bool req_stop_sleeping = false; + int64_t time_last_task = 0; // queues std::deque queue_tasks; @@ -24,6 +27,7 @@ struct server_queue { // callback functions std::function callback_new_task; std::function callback_update_slots; + std::function callback_sleeping_state; public: // Add a new task to the end of the queue @@ -38,15 +42,18 @@ struct server_queue { // Get the next id for creating a new task int get_new_id(); - // Register function to process a new task - void on_new_task(std::function callback); - - // Register the function to be called when all slots data is ready to be processed - void on_update_slots(std::function callback); - // Call when the state of one slot is changed, it will move one task from deferred to main queue void pop_deferred_task(); + // if sleeping, request exiting sleep state and wait until it is done + // returns immediately if not sleeping + void wait_until_no_sleep(); + + bool is_sleeping() { + std::unique_lock lock(mutex_tasks); + return sleeping; + } + // end the start_loop routine void terminate(); @@ -56,8 +63,15 @@ struct server_queue { * - Process the task (i.e. 
maybe copy data into slot) * - Check if multitask is finished * - Update all slots + * + * Sleeping procedure (disabled if idle_sleep_ms < 0): + * - If there is no task after idle_sleep_ms, enter sleeping state + * - Call callback_sleeping_state(true) + * - Wait until req_stop_sleeping is set to true + * - Call callback_sleeping_state(false) + * - Exit sleeping state */ - void start_loop(); + void start_loop(int64_t idle_sleep_ms = -1); // for metrics size_t queue_tasks_deferred_size() { @@ -65,6 +79,27 @@ struct server_queue { return queue_tasks_deferred.size(); } + // + // Functions below are not thread-safe, must only be used before start_loop() is called + // + + // Register function to process a new task + void on_new_task(std::function callback) { + callback_new_task = std::move(callback); + } + + // Register the function to be called when all slots data is ready to be processed + void on_update_slots(std::function callback) { + callback_update_slots = std::move(callback); + } + + // Register callback for sleeping state change + // note: when entering sleeping state, the callback is called AFTER sleeping is set to true + // when leaving sleeping state, the callback is called BEFORE sleeping is set to false + void on_sleeping_state(std::function callback) { + callback_sleeping_state = std::move(callback); + } + private: void cleanup_pending_task(int id_target); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b6b611b3f45..fa49ed94e9f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -252,7 +252,15 @@ int main(int argc, char ** argv, char ** envp) { return 1; } - ctx_server.init(); + if (!ctx_server.init()) { + clean_up(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } + LOG_ERR("%s: exiting due to server initialization error\n", __func__); + return 1; + } + ctx_http.is_ready.store(true); LOG_INF("%s: model loaded\n", __func__); From 197e5785ccdcf6d2089000fc69077e16a1236965 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 15:44:43 +0100 Subject: [PATCH 02/14] implement server-context suspend --- tools/cli/cli.cpp | 2 - tools/server/server-context.cpp | 71 +++++++++++++++++++++++---------- tools/server/server-context.h | 3 -- tools/server/server.cpp | 9 ----- 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 8a8639207b8..128679d020c 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -209,8 +209,6 @@ int main(int argc, char ** argv) { return 1; } - ctx_cli.ctx_server.init(); - console::spinner::stop(); console::log("\n"); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 494971856b9..9c0734f7cac 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -556,7 +556,15 @@ struct server_context_impl { common_chat_templates_ptr chat_templates; oaicompat_parser_options oai_parser_opt; + bool sleeping = false; + ~server_context_impl() { + if (!sleeping) { + destroy(); + } + } + + void destroy() { mtmd_free(mctx); // Clear any sampling context @@ -573,8 +581,39 @@ struct server_context_impl { llama_batch_free(batch); } + void handle_sleeping_state(bool new_state) { + GGML_ASSERT(sleeping != new_state); + if (new_state) { + SRV_INF("%s", "server is entering sleeping state\n"); + destroy(); + } else { + SRV_INF("%s", "server is exiting sleeping state\n"); + if (!load_model(params_base)) { + SRV_ERR("%s", "fatal: failed to reload model after sleeping\n"); + exit(1); + } + } + sleeping = 
new_state; + } + // load the model and initialize llama_context + // this may also be called to resume from sleeping state bool load_model(const common_params & params) { + bool is_resume = sleeping; + + if (!is_resume) { + // wiring up server queues + queue_tasks.on_new_task([this](server_task && task) { + process_single_task(std::move(task)); + }); + queue_tasks.on_update_slots([this]() { + update_slots(); + }); + queue_tasks.on_sleeping_state([this](bool sleeping) { + handle_sleeping_state(sleeping); + }); + } + SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; @@ -646,7 +685,9 @@ struct server_context_impl { std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { - mtmd_helper_log_set(common_log_default_callback, nullptr); + if (!is_resume) { + mtmd_helper_log_set(common_log_default_callback, nullptr); + } mtmd_context_params mparams = mtmd_context_params_default(); mparams.use_gpu = params_base.mmproj_use_gpu; @@ -691,23 +732,6 @@ struct server_context_impl { } } - return true; - } - - // initialize slots and server-related data - bool init() { - // wiring up server queues - queue_tasks.on_new_task([this](server_task && task) { - process_single_task(std::move(task)); - }); - queue_tasks.on_update_slots([this]() { - update_slots(); - }); - queue_tasks.on_sleeping_state([](bool sleeping) { - // TODO: handle sleeping state - SRV_INF("server queue is now %s\n", sleeping ? "sleeping" : "awake"); - }); - // Necessary similarity of prompt for slot selection slot_prompt_similarity = params_base.slot_prompt_similarity; @@ -722,6 +746,7 @@ struct server_context_impl { n_ctx_slot = n_ctx_train; } + slots.clear(); for (int i = 0; i < params_base.n_parallel; i++) { server_slot slot; @@ -778,6 +803,12 @@ struct server_context_impl { batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } + if (is_resume) { + return true; + } + + // everything below this line is only for fresh model load + metrics.init(); if (params_base.cache_ram_mib != 0) { @@ -2717,10 +2748,6 @@ struct server_context_impl { server_context::server_context() : impl(new server_context_impl()) {} server_context::~server_context() = default; -bool server_context::init() { - return impl->init(); -} - bool server_context::load_model(const common_params & params) { return impl->load_model(params); } diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 34a499f1ca7..74074c6a431 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -22,9 +22,6 @@ struct server_context { server_context(); ~server_context(); - // initialize slots and server-related data - bool init(); - // load the model and initialize llama_context // returns true on success bool load_model(const common_params & params); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index fa49ed94e9f..e323e4b2e66 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -252,15 +252,6 @@ int main(int argc, char ** argv, char ** envp) { return 1; } - if (!ctx_server.init()) { - clean_up(); - if (ctx_http.thread.joinable()) { - ctx_http.thread.join(); - } - LOG_ERR("%s: exiting due to server initialization error\n", __func__); - return 1; - } - ctx_http.is_ready.store(true); LOG_INF("%s: model loaded\n", __func__); From db3b78d21b125f7d7c1ce5a542cb243295d77f9c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 15:50:54 +0100 Subject: [PATCH 03/14] add test --- tools/server/tests/unit/test_sleep.py | 39 
+++++++++++++++++++++++++++ tools/server/tests/utils.py | 3 +++ 2 files changed, 42 insertions(+) create mode 100644 tools/server/tests/unit/test_sleep.py diff --git a/tools/server/tests/unit/test_sleep.py b/tools/server/tests/unit/test_sleep.py new file mode 100644 index 00000000000..3374165e83e --- /dev/null +++ b/tools/server/tests/unit/test_sleep.py @@ -0,0 +1,39 @@ +import pytest +import time +from utils import * + +server = ServerPreset.tinyllama2() + + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + + +def test_server_sleep(): + global server + server.sleep_idle_seconds = 1 + server.start() + + # wait a bit so that server can go to sleep + time.sleep(2) + + # make sure these endpoints are still responsive after sleep + res = server.make_request("GET", "/health") + assert res.status_code == 200 + res = server.make_request("GET", "/props") + assert res.status_code == 200 + assert res.body["is_sleeping"] == True + + # make a generation request to wake up the server + res = server.make_request("POST", "/completion", data={ + "n_predict": 1, + "prompt": "Hello", + }) + assert res.status_code == 200 + + # it should no longer be sleeping + res = server.make_request("GET", "/props") + assert res.status_code == 200 + assert res.body["is_sleeping"] == False diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 48e7403602f..f76bb1a9115 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -100,6 +100,7 @@ class ServerProcess: server_path: str | None = None mmproj_url: str | None = None media_path: str | None = None + sleep_idle_seconds: int | None = None # session variables process: subprocess.Popen | None = None @@ -230,6 +231,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--mmproj-url", self.mmproj_url]) if self.media_path: server_args.extend(["--media-path", self.media_path]) + if self.sleep_idle_seconds is not None: + server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) args = [str(arg) for arg in [server_path, *server_args]] print(f"tests: starting server with: {' '.join(args)}") From aea8f8c113279a3f7da3e1fa8f1ee566281c6bf0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 16:02:10 +0100 Subject: [PATCH 04/14] add docs --- tools/server/README.md | 10 ++++++++++ tools/server/server-queue.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/server/README.md b/tools/server/README.md index a67155c5028..71f1d4777ce 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1621,6 +1621,16 @@ Example of an error: } ``` +## Sleeping on Idle + +The server supports an automatic sleep mode that activates after a specified period of inactivity (no incoming tasks). This feature, introduced in [PR #18228](https://github.com/ggml-org/llama.cpp/pull/18228), can be enabled using the `--sleep-idle-seconds` command-line argument. It works seamlessly in both single-model and multi-model configurations. + +When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload. + +Note that the following endpoints are exempt from being considered as incoming tasks. 
They do not trigger model reloading and do not reset the idle timer: - `GET /health` - `GET /props` ## More examples ### Interactive mode diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 66a90e50a66..8ac37a20f6b 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -63,7 +63,7 @@ struct server_queue { * - Process the task (i.e. maybe copy data into slot) * - Check if multitask is finished * - Update all slots - * + * Sleeping procedure (disabled if idle_sleep_ms < 0): * - If there is no task after idle_sleep_ms, enter sleeping state * - Call callback_sleeping_state(true) * - Wait until req_stop_sleeping is set to true * - Call callback_sleeping_state(false) * - Exit sleeping state From 44a5a26c32ec7d41706680a4d2ca1f268c9166a2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 16:13:57 +0100 Subject: [PATCH 05/14] optimization: add fast path --- common/arg.cpp | 3 +++ tools/server/server-context.cpp | 2 ++ 2 files changed, 5 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 0f8ad61ffe2..13020654982 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2891,6 +2891,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--sleep-idle-seconds"}, "SECONDS", string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds), [](common_params & params, int value) { + if (value == 0 || value < -1) { + throw std::invalid_argument("invalid value: cannot be 0 or less than -1"); + } params.sleep_idle_seconds = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9c0734f7cac..ab7df5e4b14 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2786,6 +2786,8 @@ struct server_res_generator : server_http_res { server_response_reader rd; server_res_generator(server_context_impl & ctx_server, bool bypass_sleep = false) : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) { + // fast path in case sleeping is disabled + bypass_sleep |= ctx_server.params_base.sleep_idle_seconds < 0; if (!bypass_sleep) { ctx_server.queue_tasks.wait_until_no_sleep(); } From e6ab62c4a1af57fcba5c982ced53c2522b9517d1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 19:09:20 +0100 Subject: [PATCH 06/14] make sure to free llama_init --- tools/server/server-context.cpp | 5 +++++ tools/server/server-context.h | 2 +- tools/server/server.cpp | 6 +++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ab7df5e4b14..d39b425209b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -565,7 +565,12 @@ struct server_context_impl { } void destroy() { + llama_init.reset(); + ctx = nullptr; + model = nullptr; + mtmd_free(mctx); + mctx = nullptr; // Clear any sampling context for (server_slot & slot : slots) { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 74074c6a431..a56be7b8e7e 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -32,7 +32,7 @@ struct server_context { // terminate main loop (will unblock start_loop) void terminate(); - // get the underlaying llama_context + // get the underlying llama_context, can return nullptr if sleeping llama_context * get_llama_context() const; // get a new response reader, used by CLI application diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e323e4b2e66..ff650ab2ec1 
100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -308,7 +308,11 @@ int main(int argc, char ** argv, char ** envp) { if (monitor_thread.joinable()) { monitor_thread.join(); } - llama_memory_breakdown_print(ctx_server.get_llama_context()); + + auto * ll_ctx = ctx_server.get_llama_context(); + if (ll_ctx != nullptr) { + llama_memory_breakdown_print(ll_ctx); + } } return 0; From 937b06415d0d0f46a171b67f018f1b15267ac8d5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 19:31:09 +0100 Subject: [PATCH 07/14] nits --- tools/server/server-context.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d39b425209b..cb86d17ed6d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -560,6 +560,8 @@ struct server_context_impl { ~server_context_impl() { if (!sleeping) { + // destroy() is already called when entering sleeping state + // we don't call it again here to avoid double free destroy(); } } @@ -594,8 +596,7 @@ struct server_context_impl { } else { SRV_INF("%s", "server is exiting sleeping state\n"); if (!load_model(params_base)) { - SRV_ERR("%s", "fatal: failed to reload model after sleeping\n"); - exit(1); + GGML_ABORT("failed to reload model after sleeping"); } } sleeping = new_state; From 105e2f3cf67cf726e9ea9761faf4c1bdd824370d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 19:38:20 +0100 Subject: [PATCH 08/14] fix use-after-free --- tools/server/server-context.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index cb86d17ed6d..335bb27cb8a 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -809,14 +809,11 @@ struct server_context_impl { batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } - if (is_resume) { - return true; + // preserve metric state across resumes + if (!is_resume) { + metrics.init(); } - // everything below this line is only for fresh model load - - metrics.init(); - if (params_base.cache_ram_mib != 0) { if (params_base.cache_ram_mib < 0) { SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); From fd09f8800439b3397ba035c9db7cbb1e57293bc3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 20:02:06 +0100 Subject: [PATCH 09/14] allow /models to be accessed during sleeping, fix use-after-free --- tools/server/README.md | 1 + tools/server/server-context.cpp | 53 ++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 71f1d4777ce..c772afe0058 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1630,6 +1630,7 @@ When the server enters sleep mode, the model and its associated memory (includin Note that the following endpoints are exempt from being considered as incoming tasks. 
They do not trigger model reloading and do not reset the idle timer: - `GET /health` - `GET /props` +- `GET /models` and `GET /v1/models` ## More examples diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 335bb27cb8a..f0b4a1e91ed 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -544,9 +544,9 @@ struct server_context_impl { server_metrics metrics; - // cached responses for HTTP API - json json_server_props = json::object(); - // json json_server_models = json::object(); // TODO + // cached responses for HTTP API (read-only from HTTP threads) + json json_server_props = json::object(); + json json_server_model_meta = json::object(); // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -920,6 +920,18 @@ struct server_context_impl { } } + // populate model metadata + { + json_server_model_meta = { + {"vocab_type", llama_vocab_type (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, + {"n_ctx_train", llama_model_n_ctx_train(model)}, + {"n_embd", llama_model_n_embd (model)}, + {"n_params", llama_model_n_params (model)}, + {"size", llama_model_size (model)}, + }; + } + return true; } @@ -2724,17 +2736,6 @@ struct server_context_impl { SRV_DBG("%s", "run slots completed\n"); } - json model_meta() const { - return json { - {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_tokens (vocab)}, - {"n_ctx_train", llama_model_n_ctx_train(model)}, - {"n_embd", llama_model_n_embd (model)}, - {"n_params", llama_model_n_params (model)}, - {"size", llama_model_size (model)}, - }; - } - int get_slot_n_ctx() { return slots.back().n_ctx; } @@ -2812,6 +2813,7 @@ struct server_res_generator : server_http_res { // static std::unique_ptr handle_completions_impl( + std::unique_ptr && res_ptr, server_context_impl & ctx_server, server_task_type type, const json & data, @@ -2820,7 +2822,7 @@ static std::unique_ptr handle_completions_impl( task_response_type res_type) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - auto res = std::make_unique(ctx_server); + auto res = std::move(res_ptr); auto completion_id = gen_chatcmplid(); auto & rd = res->rd; @@ -3024,6 +3026,9 @@ static std::unique_ptr handle_completions_impl( } void server_routes::init_routes() { + // IMPORTANT: all lambda functions must start with std::make_unique + // this is to ensure that the server_res_generator can handle sleeping case correctly + this->get_health = [this](const server_http_req &) { // error and loading states are handled by middleware auto res = std::make_unique(ctx_server, true); @@ -3329,6 +3334,7 @@ void server_routes::init_routes() { std::vector files; // dummy return handle_completions_impl( + std::move(res), ctx_server, SERVER_TASK_TYPE_INFILL, data, @@ -3338,9 +3344,11 @@ void server_routes::init_routes() { }; this->post_completions = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( + std::move(res), ctx_server, SERVER_TASK_TYPE_COMPLETION, body, @@ -3350,9 +3358,11 @@ void server_routes::init_routes() { }; this->post_completions_oai = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( + std::move(res), ctx_server, SERVER_TASK_TYPE_COMPLETION, body, @@ -3362,6 +3372,7 @@ void server_routes::init_routes() { 
}; this->post_chat_completions = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( @@ -3369,6 +3380,7 @@ void server_routes::init_routes() { ctx_server.oai_parser_opt, files); return handle_completions_impl( + std::move(res), ctx_server, SERVER_TASK_TYPE_COMPLETION, body_parsed, @@ -3378,6 +3390,7 @@ void server_routes::init_routes() { }; this->post_anthropic_messages = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( @@ -3385,6 +3398,7 @@ void server_routes::init_routes() { ctx_server.oai_parser_opt, files); return handle_completions_impl( + std::move(res), ctx_server, SERVER_TASK_TYPE_COMPLETION, body_parsed, @@ -3422,14 +3436,17 @@ void server_routes::init_routes() { return res; }; - // TODO: allow this endpoint to be accessed bypassing sleep mode, same method as get_props this->get_models = [this](const server_http_req &) { - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server, true); json model_meta = nullptr; if (is_ready()) { - model_meta = ctx_server.model_meta(); + model_meta = ctx_server.json_server_model_meta; } bool has_mtmd = ctx_server.mctx != nullptr; + + // IMPORTANT: this endpoint can be accessed on model loading and in sleeping mode + // do NOT access dynamic model info that requires calling libllama APIs + json models = { {"models", { { From 0bb9bc486a929d55db7c5dd0ecd787707c067d3f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 20:13:06 +0100 Subject: [PATCH 10/14] don't allow accessing /models during sleep, it is not thread-safe --- tools/server/README.md | 1 - tools/server/server-context.cpp | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index c772afe0058..71f1d4777ce 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1630,7 +1630,6 @@ When the server enters sleep mode, the model and its associated memory (includin Note that the following endpoints are exempt from being considered as incoming tasks. They do not trigger model reloading and do not reset the idle timer: - `GET /health` - `GET /props` -- `GET /models` and `GET /v1/models` ## More examples diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f0b4a1e91ed..3c07c825927 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3436,8 +3436,10 @@ void server_routes::init_routes() { return res; }; + // TODO: this endpoint is unsafe to access during model reloading (i.e. wake up from sleeping) + // how to make it work even during load_model()? 
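// One conceivable answer to the TODO above (a sketch, not part of this series):
// publish the cached metadata through a snapshot pointer that load_model()
// swaps only once the object is fully built, so HTTP threads can never observe
// a half-rebuilt JSON. The type below is hypothetical; it assumes <memory> and
// <mutex> are available and that `json` is the alias used in this file.
struct model_meta_snapshot {
    std::shared_ptr<const json> snap = std::make_shared<const json>(json::object());
    mutable std::mutex m;
    void publish(json v) {                      // main thread, at the end of load_model()
        auto p = std::make_shared<const json>(std::move(v));
        std::lock_guard<std::mutex> lk(m);
        snap = std::move(p);
    }
    std::shared_ptr<const json> read() const {  // any HTTP thread; lock held only for the copy
        std::lock_guard<std::mutex> lk(m);
        return snap;
    }
};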
this->get_models = [this](const server_http_req &) { - auto res = std::make_unique(ctx_server, true); + auto res = std::make_unique(ctx_server); json model_meta = nullptr; if (is_ready()) { model_meta = ctx_server.json_server_model_meta; } From d8500827783c7125d627dd296f531281fca8cfec Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 20:21:42 +0100 Subject: [PATCH 11/14] fix data race on accessing props and model_meta --- tools/server/server-context.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 3c07c825927..cd4ef26ac95 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -863,9 +863,12 @@ struct server_context_impl { common_chat_templates_source(chat_templates.get()), common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); - if (!populate_json_responses()) { - SRV_ERR("%s", "failed to populate JSON responses\n"); - return false; + if (!is_resume) { + // do not repopulate on resume, as HTTP threads may still be using the existing JSON data + if (!populate_json_responses()) { + SRV_ERR("%s", "failed to populate JSON responses\n"); + return false; + } } return true; From 1663d2f8f7676fa7688fe32a6333d547de278d17 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 21:51:59 +0100 Subject: [PATCH 12/14] small clean up --- tools/server/README-dev.md | 2 ++ tools/server/server-context.cpp | 51 ++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index fbcd6bc1f93..3fea3042f72 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -107,6 +107,8 @@ For detailed instructions, see the [test documentation](./tests/README.md). 
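Of the entries below, the sleeping-mode PR hooks into `server_queue` entirely through callbacks, so the queue itself stays model-agnostic. A minimal sketch of that wiring, using the API the PR adds (the 60-second timeout is an illustrative value; error handling omitted):

```cpp
server_queue queue;
queue.on_new_task([](server_task && task) { /* move the task data into a slot */ });
queue.on_update_slots([]() { /* run one decode step across all slots */ });
queue.on_sleeping_state([](bool sleeping) {
    if (sleeping) {
        // entering sleep: free the llama_context, KV cache, mtmd context, ...
    } else {
        // waking up: reload the model before the pending task is processed
    }
});
queue.start_loop(/* idle_sleep_ms = */ 60 * 1000); // sleep after 60 s with no tasks
```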
- Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362 - Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470 - Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808 +- INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169) +- Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228 diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index cd4ef26ac95..4f5f14095ee 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -607,19 +607,6 @@ struct server_context_impl { bool load_model(const common_params & params) { bool is_resume = sleeping; - if (!is_resume) { - // wiring up server queues - queue_tasks.on_new_task([this](server_task && task) { - process_single_task(std::move(task)); - }); - queue_tasks.on_update_slots([this]() { - update_slots(); - }); - queue_tasks.on_sleeping_state([this](bool sleeping) { - handle_sleeping_state(sleeping); - }); - } - SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; @@ -809,11 +796,6 @@ struct server_context_impl { batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } - // preserve metric state across resumes - if (!is_resume) { - metrics.init(); - } - if (params_base.cache_ram_mib != 0) { if (params_base.cache_ram_mib < 0) { SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); @@ -864,11 +846,34 @@ struct server_context_impl { common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); if (!is_resume) { - // do not repopulate on resume, as HTTP threads may be still using the existing JSON data - if (!populate_json_responses()) { - SRV_ERR("%s", "failed to populate JSON responses\n"); - return false; - } + return init(); + } + + return true; + } + + // unlike load_model(), this is only called once during initialization + bool init() { + GGML_ASSERT(ctx != nullptr); + GGML_ASSERT(model != nullptr); + GGML_ASSERT(!sleeping); + + // wiring up server queues + queue_tasks.on_new_task([this](server_task && task) { + process_single_task(std::move(task)); + }); + queue_tasks.on_update_slots([this]() { + update_slots(); + }); + queue_tasks.on_sleeping_state([this](bool sleeping) { + handle_sleeping_state(sleeping); + }); + + metrics.init(); + + if (!populate_json_responses()) { + SRV_ERR("%s", "failed to populate JSON responses\n"); + return false; } return true; From b51da9a18fb837fb39f085742cc721851ca6c6ea Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 21:54:16 +0100 Subject: [PATCH 13/14] trailing whitespace --- tools/server/server-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4f5f14095ee..4751d921bfe 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -851,7 +851,7 @@ struct server_context_impl { return true; } - + // unlike load_model(), this is only called once during initialization bool init() { GGML_ASSERT(ctx != nullptr); From 06a5ebe187caa458ea8f1d865e744e8b31f58ef4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 20 Dec 2025 22:18:41 +0100 Subject: [PATCH 14/14] rm outdated comments --- tools/server/server-context.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 4751d921bfe..cde34e6533c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3453,10 +3453,6 @@ void server_routes::init_routes() { model_meta = ctx_server.json_server_model_meta; } bool has_mtmd = ctx_server.mctx != nullptr; - - // IMPORTANT: this endpoint can be accessed on model loading and in sleeping mode - // do NOT access dynamic model info that requires calling libllama APIs - json models = { {"models", { {
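Taken together, the sleep/wake handshake the series implements is small: an HTTP thread sets `req_stop_sleeping` and blocks, then the main loop reloads the model, clears `sleeping`, and notifies the waiters. Below is a self-contained model of that handshake; it is a sketch following the same semantics, not code taken from the patches:

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

// Minimal stand-in for the server_queue sleep/wake handshake.
struct sleep_gate {
    std::mutex m;
    std::condition_variable cv;
    bool sleeping = true;   // the main loop starts out asleep in this demo
    bool req_wake = false;

    // HTTP thread: equivalent of wait_until_no_sleep()
    void wait_until_awake() {
        std::unique_lock<std::mutex> lk(m);
        if (!sleeping) return;                    // fast path when already awake
        req_wake = true;
        cv.notify_all();                          // wake the main loop
        cv.wait(lk, [&] { return !sleeping; });   // block until the reload is done
    }

    // main loop: equivalent of the sleeping branch in start_loop()
    void run_main_loop_once() {
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, [&] { return req_wake; });    // sleep until a wake is requested
        // ... reload the model here ...
        req_wake = false;
        sleeping = false;
        cv.notify_all();                          // release the waiting HTTP threads
    }
};

int main() {
    sleep_gate g;
    std::thread main_loop([&] { g.run_main_loop_once(); });
    g.wait_until_awake();                         // simulates an incoming request
    std::puts("server is awake");
    main_loop.join();
}
```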