diff --git a/core/index.d.ts b/core/index.d.ts
index 1a1407d524..77599c13cc 100644
--- a/core/index.d.ts
+++ b/core/index.d.ts
@@ -537,6 +537,7 @@ interface BaseCompletionOptions {
   mirostat?: number;
   stop?: string[];
   maxTokens?: number;
+  numThreads?: number;
 }
 
 export interface ModelDescription {
diff --git a/core/llm/llms/Ollama.ts b/core/llm/llms/Ollama.ts
index 7ec3df69c4..054a636d82 100644
--- a/core/llm/llms/Ollama.ts
+++ b/core/llm/llms/Ollama.ts
@@ -120,6 +120,7 @@ class Ollama extends BaseLLM {
       stop: options.stop,
       num_ctx: this.contextLength,
       mirostat: options.mirostat,
+      num_thread: options.numThreads,
     },
   };
 
diff --git a/docs/docs/reference/Model Providers/ollama.md b/docs/docs/reference/Model Providers/ollama.md
index 6dc459334b..e036839eb7 100644
--- a/docs/docs/reference/Model Providers/ollama.md
+++ b/docs/docs/reference/Model Providers/ollama.md
@@ -8,10 +8,33 @@
     {
       "title": "Ollama",
       "provider": "ollama",
-      "model": "llama2-7b"
+      "model": "llama2-7b",
+      "completionOptions": {}
     }
   ]
 }
 ```
 
+## Completion Options
+
+In addition to the model itself, you can configure some of the parameters that Ollama uses to run it:
+
+- `temperature` (`options.temperature`): Controls the randomness of the generated text. Higher values produce more creative but potentially less coherent output, while lower values produce more predictable, focused output.
+- `top_p` (`options.topP`): The nucleus-sampling threshold, between 0 and 1. Only the smallest set of tokens whose cumulative probability exceeds this threshold is considered for sampling, so lower values restrict generation to high-probability tokens.
+- `top_k` (`options.topK`): Limits sampling to the k most probable tokens at each step. Higher values increase the variety of generated sequences, while lower values lead to more focused output.
+- `num_predict` (`options.maxTokens`): The maximum number of tokens to generate for the given prompt.
+- `num_thread` (`options.numThreads`): The number of threads the model uses for parallel processing. Higher values may speed up generation but also increase CPU and memory usage. When running the model locally, set this to one or two below the number of threads your CPU supports to leave some capacity for your GUI.
+
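+For example, a `completionOptions` block that sets several of these parameters at once might look like the following (the values are illustrative starting points, not recommendations; tune them for your model and hardware):
+
+```json
+"completionOptions": {
+  "temperature": 0.8,
+  "topP": 0.9,
+  "topK": 40,
+  "maxTokens": 1024,
+  "numThreads": 6
+}
+```
+
 [View the source](https://github.com/continuedev/continue/blob/main/core/llm/llms/Ollama.ts)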