feat: new CLI options
Signed-off-by: Xin Liu <[email protected]>
apepkuss committed Aug 13, 2024
1 parent bc3676d commit 9c9aafc
Showing 4 changed files with 40 additions and 10 deletions.
6 changes: 0 additions & 6 deletions Cargo.lock

(Generated lockfile; diff not rendered.)

8 changes: 5 additions & 3 deletions Cargo.toml
@@ -4,9 +4,11 @@ version = "0.8.2"
 edition = "2021"
 
 [dependencies]
-endpoints = { version = "=0.12.0" }
-chat-prompts = { version = "=0.11.1" }
-llama-core = { version = "=0.14.1", features = ["logging"] }
+endpoints = { version = "=0.12.0", path = "/Volumes/Dev/secondstate/me/LlamaEdge/api-server/endpoints" }
+chat-prompts = { version = "=0.11.1", path = "/Volumes/Dev/secondstate/me/LlamaEdge/api-server/chat-prompts" }
+llama-core = { version = "=0.14.1", features = [
+    "logging",
+], path = "/Volumes/Dev/secondstate/me/LlamaEdge/api-server/llama-core" }
 futures = { version = "0.3.6", default-features = false, features = ["async-await", "std"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
6 changes: 6 additions & 0 deletions README.md
@@ -486,6 +486,12 @@ To check the CLI options of the `rag-api-server` wasm app, you can run the following
         The main GPU to use
     --tensor-split <TENSOR_SPLIT>
         How split tensors should be distributed across GPUs. If None, the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" assigns 60% of the data to GPU 0 and 40% to GPU 1
+    --threads <THREADS>
+        Number of threads to use during computation [default: 2]
+    --grammar <GRAMMAR>
+        BNF-like grammar to constrain generations (see samples in grammars/ dir) [default: ]
+    --json-schema <JSON_SCHEMA>
+        JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas with external $refs, use --grammar + example/json_schema_to_grammar.py instead
     -b, --batch-size <BATCH_SIZE>
         Sets batch sizes for the chat and embedding models, respectively. The sizes are comma-separated without spaces, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model [default: 512,512]
     --rag-prompt <RAG_PROMPT>
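For a sense of what the two new constraint flags accept: per the help text above, `--json-schema` takes a JSON schema (with `{}` accepting any JSON object), while `--grammar` takes a BNF-like grammar in the style of the samples in the `grammars/` dir, and is the fallback for schemas with external `$refs`. A small illustrative sketch; the grammar below is a generic GBNF-style example, not taken from this repository:

```rust
// Illustrative values for the new constraint flags (hypothetical examples,
// not from this repository's grammars/ dir).
fn main() {
    // --grammar: a BNF-like grammar restricting output to "yes" or "no".
    let grammar = r#"root ::= "yes" | "no""#;

    // --json-schema: per the help text, `{}` accepts any JSON object.
    let json_schema = "{}";

    println!("--grammar '{grammar}' --json-schema '{json_schema}'");
}
```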
30 changes: 29 additions & 1 deletion src/main.rs
@@ -78,6 +78,15 @@ struct Cli {
     /// How split tensors should be distributed across GPUs. If None, the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" assigns 60% of the data to GPU 0 and 40% to GPU 1.
     #[arg(long)]
     tensor_split: Option<String>,
+    /// Number of threads to use during computation
+    #[arg(long, default_value = "2")]
+    threads: u64,
+    /// BNF-like grammar to constrain generations (see samples in grammars/ dir).
+    #[arg(long, default_value = "")]
+    pub grammar: String,
+    /// JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas with external $refs, use --grammar + example/json_schema_to_grammar.py instead.
+    #[arg(long)]
+    pub json_schema: Option<String>,
     /// Sets batch sizes for the chat and embedding models, respectively. The sizes are comma-separated without spaces, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model.
     #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))]
     batch_size: Vec<u64>,
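A trimmed-down sketch of how these fields parse, covering only the options added in this commit plus `batch_size` for the delimiter behavior. It assumes clap 4 with the `derive` feature, which the attribute syntax above matches; `MiniCli` is a hypothetical stand-in for the server's full `Cli` struct:

```rust
use clap::Parser;

/// Minimal subset of the server's Cli, just to exercise the new options.
#[derive(Parser, Debug)]
struct MiniCli {
    /// Number of threads to use during computation
    #[arg(long, default_value = "2")]
    threads: u64,
    /// BNF-like grammar to constrain generations
    #[arg(long, default_value = "")]
    grammar: String,
    /// JSON schema to constrain generations
    #[arg(long)]
    json_schema: Option<String>,
    /// Comma-separated batch sizes for the chat and embedding models
    #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))]
    batch_size: Vec<u64>,
}

fn main() {
    // Defaults: threads = 2, grammar = "", json_schema = None, batch_size = [512, 512].
    let defaults = MiniCli::parse_from(["app"]);
    assert_eq!(defaults.threads, 2);
    assert_eq!(defaults.batch_size, vec![512, 512]);

    // Explicit values, matching the help text's `--batch-size 128,64` example.
    let cli = MiniCli::parse_from(["app", "--threads", "8", "--json-schema", "{}", "-b", "128,64"]);
    assert_eq!(cli.json_schema.as_deref(), Some("{}"));
    assert_eq!(cli.batch_size, vec![128, 64]);
}
```

Note that `value_delimiter = ','` applies to the default value as well, which is why `"512,512"` yields a two-element `Vec<u64>`.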
@@ -222,6 +231,19 @@ async fn main() -> Result<(), ServerError> {
         info!(target: "server_config", "tensor_split: {}", tensor_split);
     }
 
+    // log threads
+    info!(target: "stdout", "threads: {}", cli.threads);
+
+    // log grammar
+    if !cli.grammar.is_empty() {
+        info!(target: "stdout", "grammar: {}", &cli.grammar);
+    }
+
+    // log json schema
+    if let Some(json_schema) = &cli.json_schema {
+        info!(target: "stdout", "json_schema: {}", json_schema);
+    }
+
     // log rag prompt
     if let Some(rag_prompt) = &cli.rag_prompt {
         info!(target: "server_config", "rag_prompt: {}", rag_prompt);
@@ -290,7 +312,10 @@ async fn main() -> Result<(), ServerError> {
         .with_n_predict(cli.n_predict)
         .with_n_gpu_layers(cli.n_gpu_layers)
         .with_main_gpu(cli.main_gpu)
-        .with_tensor_split(cli.tensor_split)
+        .with_tensor_split(cli.tensor_split.clone())
+        .with_threads(cli.threads)
+        .with_grammar(cli.grammar)
+        .with_json_schema(cli.json_schema)
         .enable_plugin_log(true)
         .enable_debug_log(plugin_debug)
         .build();
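One detail worth noting in this hunk: `cli.tensor_split` is now consumed twice, once here for the chat-model config and once in the next hunk for the embedding-model config, so the first use must `.clone()` the `Option<String>`. A minimal sketch of the ownership constraint (struct and function names are illustrative, not from the repo):

```rust
// Why the first builder call clones: an Option<String> is moved on use,
// and the same value is needed again for the second config.
struct Cli {
    tensor_split: Option<String>,
}

fn build_configs(cli: Cli) {
    let chat_split = cli.tensor_split.clone(); // keep a copy for the chat model
    let embedding_split = cli.tensor_split;    // move the original for the embedding model
    // Dropping the `.clone()` above would make this second use a compile error:
    // "use of moved value: `cli.tensor_split`".
    let _ = (chat_split, embedding_split);
}

fn main() {
    build_configs(Cli { tensor_split: Some("3,2".to_string()) });
}
```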
@@ -322,6 +347,9 @@
         )
         .with_ctx_size(cli.ctx_size[1])
         .with_batch_size(cli.batch_size[1])
+        .with_main_gpu(cli.main_gpu)
+        .with_tensor_split(cli.tensor_split)
+        .with_threads(cli.threads)
         .enable_plugin_log(true)
         .enable_debug_log(plugin_debug)
         .build();
