Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3518,15 +3518,16 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());

args.push_back(common_arg(
{"unsafe-allow-api-override"}, "PARAM1,PARAM2,...",
"allow overriding these params via /models/load endpoint (unsafe)",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_UNSAFE_ALLOW_API_OVERRIDE).set_preset_only());

// TODO:
// args.push_back(common_arg(
// {"pin"},
// "in server router mode, do not unload this model if models_max is exceeded",
// [](common_params &) { /* unused */ }
// ).set_preset_only());

// args.push_back(common_arg(
// {"unload-idle-seconds"}, "SECONDS",
// "in server router mode, unload models idle for more than this many seconds",
// [](common_params &, int) { /* unused */ }
// ).set_preset_only());
}
3 changes: 2 additions & 1 deletion common/arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
#include <cstring>

// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_UNSAFE_ALLOW_API_OVERRIDE "__PRESET_UNSAFE_ALLOW_API_OVERRIDE"

//
// CLI argument parsing
Expand Down
41 changes: 25 additions & 16 deletions common/preset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,32 +236,41 @@ common_preset_context::common_preset_context(llama_example ex)
key_to_opt = get_map_key_opt(ctx_params);
}

// Build a single preset from a string-to-string map.
// Each key is looked up in key_to_opt; keys without a match are skipped with a warning.
// Boolean options go through parse_bool_arg(), everything else is stored verbatim.
// The resulting preset is named COMMON_PRESET_DEFAULT_NAME.
common_preset common_preset_context::load_from_map(const std::map<std::string, std::string> & arg_map) const {
    common_preset result;
    result.name = COMMON_PRESET_DEFAULT_NAME;

    for (const auto & entry : arg_map) {
        const std::string & key   = entry.first;
        const std::string & value = entry.second;
        LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());

        const auto found = key_to_opt.find(key);
        if (found == key_to_opt.end()) {
            LOG_WRN("ignoring unknown option: %s\n", key.c_str());
            continue;
        }

        const auto & opt = found->second;
        if (is_bool_arg(opt)) {
            result.options[opt] = parse_bool_arg(opt, key, value);
        } else {
            result.options[opt] = value;
        }
        LOG_DBG("accepted option: %s = %s\n", key.c_str(), result.options[opt].c_str());
    }

    return result;
}

common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
common_presets out;
auto ini_data = parse_ini_from_file(path);

for (auto section : ini_data) {
common_preset preset;
common_preset preset = load_from_map(section.second);

if (section.first.empty()) {
preset.name = COMMON_PRESET_DEFAULT_NAME;
} else {
preset.name = section.first;
}
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (key_to_opt.find(key) != key_to_opt.end()) {
const auto & opt = key_to_opt.at(key);
if (is_bool_arg(opt)) {
preset.options[opt] = parse_bool_arg(opt, key, value);
} else {
preset.options[opt] = value;
}
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
} else {
// TODO: maybe warn about unknown key?
}
}
LOG_DBG("loaded preset: %s\n", preset.name.c_str());

if (preset.name == "*") {
// handle global preset
Expand Down
4 changes: 4 additions & 0 deletions common/preset.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ struct common_preset_context {
// generate one preset from CLI arguments
common_preset load_from_args(int argc, char ** argv) const;

// generate one preset from a string-to-string map (option key -> value)
// key can be either arg name or env variable
common_preset load_from_map(const std::map<std::string, std::string> & arg_map) const;

// cascade multiple presets if exist on both: base < added
// if preset does not exist in base, it will be added without modification
common_presets cascade(const common_presets & base, const common_presets & added) const;
Expand Down
7 changes: 6 additions & 1 deletion tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,7 @@ The precedence rule for preset options is as follows:

We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `unsafe-allow-api-override` (string): Specifies which parameters can be overridden via the `/models/load` API endpoint. Accepts multiple values separated by commas. Example: `n-gpu-layers,jinja`. **Warning:** This feature is **unsafe** and must only be used in trusted environments.

### Routing requests

Expand Down Expand Up @@ -1571,11 +1572,15 @@ Load a model

Payload:
- `model`: name of the model to be loaded.
- `overrides` (optional): preset parameter overrides, given as an object mapping strings to strings. Each overridden parameter must be whitelisted via the `unsafe-allow-api-override` preset parameter.

```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"extra_args": ["-n", "128", "--top-k", "4"]
"overrides": {
"c": "1024",
"jinja": "false"
}
}
```

Expand Down
57 changes: 49 additions & 8 deletions tools/server/server-models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <queue>
#include <filesystem>
#include <cstring>
#include <set>

#ifdef _WIN32
#include <winsock2.h>
Expand Down Expand Up @@ -244,7 +245,7 @@ void server_models::load_models() {
}
for (const auto & name : models_to_load) {
SRV_INF("(startup) loading model %s\n", name.c_str());
load(name);
load(name, {});
}
}

Expand Down Expand Up @@ -379,7 +380,7 @@ void server_models::unload_lru() {
}
}

void server_models::load(const std::string & name) {
std::vector<std::string> server_models::load(const std::string & name, const std::map<std::string, std::string> & override_params) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
Expand All @@ -390,7 +391,7 @@ void server_models::load(const std::string & name) {
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model %s is not ready\n", name.c_str());
return;
return meta.args;
}

// prepare new instance info
Expand All @@ -404,12 +405,38 @@ void server_models::load(const std::string & name) {
throw std::runtime_error("failed to get a port number");
}

// prepare arguments
if (override_params.empty()) {
inst.meta.update_args(ctx_preset, bin_path); // render args
} else {
std::unordered_set<std::string> allowed_keys;
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_UNSAFE_ALLOW_API_OVERRIDE, val)) {
auto keys = string_split<std::string>(val, ',');
for (auto & key : keys) {
allowed_keys.insert(key);
}
}
common_preset orig_preset = inst.meta.preset; // copy
for (const auto & [key, value] : override_params) {
if (allowed_keys.find(key) != allowed_keys.end()) {
inst.meta.preset.set_option(ctx_preset, key, value);
} else {
throw std::invalid_argument(string_format(
"overriding option '%s' is not allowed for model '%s'",
key.c_str(),
name.c_str()
));
}
}
inst.meta.update_args(ctx_preset, bin_path); // render args
inst.meta.preset = orig_preset; // restore
}

inst.subproc = std::make_shared<subprocess_s>();
{
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);

inst.meta.update_args(ctx_preset, bin_path); // render args

std::vector<std::string> child_args = inst.meta.args; // copy
std::vector<std::string> child_env = base_env; // copy
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
Expand Down Expand Up @@ -484,8 +511,11 @@ void server_models::load(const std::string & name) {
}
}

auto args = inst.meta.args; // save args for return
mapping[name] = std::move(inst);
cv.notify_all();

return args;
}

static void interrupt_subprocess(FILE * stdin_file) {
Expand Down Expand Up @@ -565,7 +595,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
}
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name);
load(name, {});
}

SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
Expand Down Expand Up @@ -743,8 +773,19 @@ void server_models_routes::init_routes() {
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.load(name);
res_ok(res, {{"success", true}});
std::map<std::string, std::string> overrides;
if (body.contains("overrides")) {
json overrides_json = body["overrides"];
for (auto it = overrides_json.begin(); it != overrides_json.end(); ++it) {
if (!it.value().is_string()) {
res_err(res, format_error_response("override values must be strings", ERROR_TYPE_INVALID_REQUEST));
return res;
}
overrides[it.key()] = it.value().get<std::string>();
}
}
auto args = models.load(name, overrides);
res_ok(res, {{"success", true}, {"args", args}});
return res;
};

Expand Down
3 changes: 2 additions & 1 deletion tools/server/server-models.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ struct server_models {

// load and unload model instances
// these functions are thread-safe
void load(const std::string & name);
// load() returns the argument list used to launch the model instance
// override_params maps preset option keys to replacement values for this load only;
// pass {} to use the preset as-is
// a key that is not allowed to be overridden makes load() throw std::invalid_argument
std::vector<std::string> load(const std::string & name, const std::map<std::string, std::string> & override_params);
void unload(const std::string & name);
void unload_all();

Expand Down
Loading