diff --git a/src/features.rs b/src/features.rs
index 3001beb4..1f20a6d4 100644
--- a/src/features.rs
+++ b/src/features.rs
@@ -109,24 +109,17 @@ mod tests {
     use super::*;
     use meilisearch_test_macro::meilisearch_test;
 
-    #[meilisearch_test]
-    async fn test_experimental_features_get(client: Client) {
-        let mut features = ExperimentalFeatures::new(&client);
-        features.set_vector_store(false);
-        let _ = features.update().await.unwrap();
-
-        let res = features.get().await.unwrap();
-
-        assert!(!res.vector_store);
-    }
-
+    /// There is purposely no test that disables this feature, to prevent impact on other test cases:
+    /// the setting is shared among all indexes.
     #[meilisearch_test]
     async fn test_experimental_features_enable_vector_store(client: Client) {
         let mut features = ExperimentalFeatures::new(&client);
         features.set_vector_store(true);
         let res = features.update().await.unwrap();
+        assert!(res.vector_store);
+
+        let res = features.get().await.unwrap();
         assert!(res.vector_store);
     }
 }
diff --git a/src/search.rs b/src/search.rs
index 824db05c..da3f7ef2 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -142,6 +142,30 @@ pub enum Selectors {
     All,
 }
 
+/// Configures the use of previously defined embedders for semantic (hybrid) search
+#[derive(Debug, Serialize, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct HybridSearch<'a> {
+    /// Indicates one of the embedders configured for the queried index
+    ///
+    /// **Default: `"default"`**
+    pub embedder: &'a str,
+    /// Number between `0.0` and `1.0`:
+    /// - `0.0` indicates full keyword search
+    /// - `1.0` indicates full semantic search
+    ///
+    /// **Default: `0.5`**
+    pub semantic_ratio: f32,
+}
+impl Default for HybridSearch<'_> {
+    fn default() -> Self {
+        HybridSearch {
+            embedder: "default",
+            semantic_ratio: 0.5,
+        }
+    }
+}
+
 type AttributeToCrop<'a> = (&'a str, Option<usize>);
 
 /// A struct representing a query.
@@ -350,6 +374,20 @@ pub struct SearchQuery<'a, Http: HttpClient> {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub(crate) index_uid: Option<&'a str>,
+
+    /// Defines whether to use previously defined embedders for semantic search
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub hybrid: Option<HybridSearch<'a>>,
+
+    /// Defines the query vector for semantic search when using a `userProvided` embedder
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub vector: Option<&'a [f32]>,
+
+    /// Defines whether vectors for semantic search are returned in the search results
+    ///
+    /// Can significantly increase the response size.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub retrieve_vectors: Option<bool>,
 }
 
 #[allow(missing_docs)]
@@ -379,6 +417,9 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> {
             show_ranking_score_details: None,
             matching_strategy: None,
             index_uid: None,
+            hybrid: None,
+            vector: None,
+            retrieve_vectors: None,
             distinct: None,
             ranking_score_threshold: None,
             locales: None,
@@ -474,6 +515,16 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> {
         self.filter = Some(Filter::new(Either::Right(filter)));
         self
     }
+    /// Defines whether vectors for semantic search are returned in the search results
+    ///
+    /// Can significantly increase the response size.
+    pub fn with_retrieve_vectors<'b>(
+        &'b mut self,
+        retrieve_vectors: bool,
+    ) -> &'b mut SearchQuery<'a, Http> {
+        self.retrieve_vectors = Some(retrieve_vectors);
+        self
+    }
     pub fn with_facets<'b>(
         &'b mut self,
         facets: Selectors<&'a [&'a str]>,
@@ -574,6 +625,23 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> {
         self.index_uid = Some(&self.index.uid);
         self
     }
+    /// Defines whether to use previously defined embedders for semantic search
+    pub fn with_hybrid<'b>(
+        &'b mut self,
+        embedder: &'a str,
+        semantic_ratio: f32,
+    ) -> &'b mut SearchQuery<'a, Http> {
+        self.hybrid = Some(HybridSearch {
+            embedder,
+            semantic_ratio,
+        });
+        self
+    }
+    /// Defines the query vector for semantic search when using a `userProvided` embedder
+    pub fn with_vector<'b>(&'b mut self, vector: &'a [f32]) -> &'b mut SearchQuery<'a, Http> {
+        self.vector = Some(vector);
+        self
+    }
     pub fn with_distinct<'b>(&'b mut self, distinct: &'a str) -> &'b mut SearchQuery<'a, Http> {
         self.distinct = Some(distinct);
         self
@@ -675,6 +743,36 @@ mod tests {
         kind: String,
         number: i32,
         nested: Nested,
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        _vectors: Option<Vectors>,
+    }
+
+    #[derive(Debug, Serialize, Deserialize, PartialEq)]
+    struct Vector {
+        embeddings: SingleOrMultipleVectors,
+        regenerate: bool,
+    }
+
+    #[derive(Serialize, Deserialize, Debug, PartialEq)]
+    #[serde(untagged)]
+    enum SingleOrMultipleVectors {
+        Single(Vec<f32>),
+        Multiple(Vec<Vec<f32>>),
+    }
+
+    #[derive(Debug, Serialize, Deserialize, PartialEq)]
+    struct Vectors(HashMap<String, Vector>);
+
+    impl From<&[f32; 1]> for Vectors {
+        fn from(value: &[f32; 1]) -> Self {
+            Vectors(HashMap::from([(
+                S("default"),
+                Vector {
+                    embeddings: SingleOrMultipleVectors::Multiple(Vec::from([value.to_vec()])),
+                    regenerate: false,
+                },
+            )]))
+        }
     }
 
     impl PartialEq<Map<String, Value>> for Document {
@@ -688,16 +786,16 @@ mod tests {
     async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> {
         let t0 = index.add_documents(&[
-            Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") } }, - Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") } }, - Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") } }, - Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") } }, - Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") } }, - Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") } }, - Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") } }, - Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") } }, - Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") } }, - Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") } }, + Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: Some(Vectors::from(&[1000.0]))}, + Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: Some(Vectors::from(&[2000.0])) }, + Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") }, _vectors: Some(Vectors::from(&[3000.0])) }, + Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") }, _vectors: Some(Vectors::from(&[4000.0])) }, + Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") }, _vectors: Some(Vectors::from(&[5000.0])) }, + Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") }, _vectors: Some(Vectors::from(&[6000.0])) }, + Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") }, _vectors: Some(Vectors::from(&[7000.0])) }, + Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") }, _vectors: Some(Vectors::from(&[8000.0])) }, + Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") }, _vectors: Some(Vectors::from(&[9000.0])) }, + Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") }, _vectors: Some(Vectors::from(&[10000.0])) }, ], None).await?; let t1 = index .set_filterable_attributes(["kind", "value", "number"]) @@ -785,7 +883,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("second") } + nested: Nested { child: S("second") }, + _vectors: Some(Vectors::from(&[2000.0])), }, &results.hits[0].result ); @@ -957,7 +1056,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -972,7 +1072,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: None, }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -993,7 +1094,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."),
                 kind: S("text"),
                 number: 0,
-                nested: Nested { child: S("first") }
+                nested: Nested { child: S("first") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap());
@@ -1008,7 +1110,8 @@ mod tests {
                 value: S("Lorem ipsum dolor sit amet…"),
                 kind: S("text"),
                 number: 0,
-                nested: Nested { child: S("first") }
+                nested: Nested { child: S("first") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap()
         );
@@ -1033,7 +1136,8 @@ mod tests {
                 value: S("(ꈍᴗꈍ)sed do eiusmod tempor incididunt ut(ꈍᴗꈍ)"),
                 kind: S("text"),
                 number: 0,
-                nested: Nested { child: S("first") }
+                nested: Nested { child: S("first") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap()
         );
@@ -1060,7+1164,8 @@ mod tests {
                 value: S("The (⊃。•́‿•̀。)⊃ Social ⊂(´• ω •`⊂) Network"),
                 kind: S("title"),
                 number: 20,
-                nested: Nested { child: S("third") }
+                nested: Nested { child: S("third") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap()
         );
@@ -1082,7 +1187,8 @@ mod tests {
                 value: S("dolor sit amet, consectetur adipiscing elit"),
                 kind: S("text"),
                 number: 10,
-                nested: Nested { child: S("first") }
+                nested: Nested { child: S("second") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap(),
         );
@@ -1097,7 +1203,8 @@ mod tests {
                 value: S("dolor sit amet, consectetur adipiscing elit"),
                 kind: S("text"),
                 number: 10,
-                nested: Nested { child: S("first") }
+                nested: Nested { child: S("second") },
+                _vectors: None,
             },
             results.hits[0].formatted_result.as_ref().unwrap()
         );
@@ -1295,4 +1402,77 @@ mod tests {
         Ok(())
     }
+
+    /// Enables vector search and configures a `userProvided` embedder
+    async fn setup_hybrid_searching(client: &Client, index: &Index) -> Result<(), Error> {
+        use crate::settings::{Embedder, UserProvidedEmbedderSettings};
+        let embedder_setting =
+            Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 });
+        let t3 = index
+            .set_settings(&crate::settings::Settings {
+                embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])),
+                ..crate::settings::Settings::default()
+            })
+            .await?;
+        t3.wait_for_completion(&client, None, None).await?;
+        Ok(())
+    }
+
+    #[meilisearch_test]
+    async fn test_with_vectors(client: Client, index: Index) -> Result<(), Error> {
+        setup_hybrid_searching(&client, &index).await?;
+        setup_test_index(&client, &index).await?;
+
+        let results: SearchResults<Document> = index
+            .search()
+            .with_query("lorem ipsum")
+            .with_retrieve_vectors(true)
+            .execute()
+            .await?;
+        assert_eq!(results.hits.len(), 1);
+        let expected = Vectors::from(&[1000.0]);
+        assert_eq!(results.hits[0].result._vectors, Some(expected));
+
+        let results: SearchResults<Document> = index
+            .search()
+            .with_query("lorem ipsum")
+            .with_retrieve_vectors(false)
+            .execute()
+            .await?;
+        assert_eq!(results.hits.len(), 1);
+        assert_eq!(results.hits[0].result._vectors, None);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_hybrid() -> Result<(), Error> {
+        // this is mocked as I could not get the hybrid searching to work
+        // See https://github.com/meilisearch/meilisearch-rust/pull/554 for further context
+        let mut s = mockito::Server::new_async().await;
+        let mock_server_url = s.url();
+        let client = Client::new(mock_server_url, None::<String>)?;
+        let index = client.index("mocked_index");
+
+        let req = r#"{"q":"hello hybrid searching","hybrid":{"embedder":"default","semanticRatio":0.0},"vector":[1000.0]}"#.to_string();
+        let response =
+            r#"{"hits":[],"offset":null,"limit":null,"estimatedTotalHits":null,"page":null,"hitsPerPage":null,"totalHits":null,"totalPages":null,"facetDistribution":null,"facetStats":null,"processingTimeMs":0,"query":"","indexUid":null}"#.to_string();
+        let mock_res = s
+            .mock("POST", "/indexes/mocked_index/search")
+            .with_status(200)
+            .match_body(mockito::Matcher::Exact(req))
+            .with_body(&response)
+            .expect(1)
+            .create_async()
+            .await;
+        let results: Result<SearchResults<Document>, Error> = index
+            .search()
+            .with_query("hello hybrid searching")
+            .with_hybrid("default", 0.0)
+            .with_vector(&[1000.0])
+            .execute()
+            .await;
+        mock_res.assert_async().await;
+        results?; // purposely not done above to have better debugging output
+
+        Ok(())
+    }
 }
diff --git a/src/settings.rs b/src/settings.rs
index 003ae4c2..2c55daa9 100644
--- a/src/settings.rs
+++ b/src/settings.rs
@@ -36,6 +36,303 @@ pub struct FacetingSettings {
     pub max_values_per_facet: usize,
 }
 
+/// Allows configuring how embeddings for semantic search are generated
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[serde(rename_all = "camelCase", tag = "source")]
+pub enum Embedder {
+    /// Compute embeddings inside Meilisearch with models from [HuggingFace](https://huggingface.co/).
+    /// You may be able to significantly improve performance by [compiling a CUDA-compatible Meilisearch binary](https://www.meilisearch.com/docs/guides/ai/computing_hugging_face_embeddings_gpu).
+    /// This is a resource-intensive operation and might affect indexing performance negatively.
+    HuggingFace(HuggingFaceEmbedderSettings),
+    /// Use OpenAI's API to generate embeddings
+    OpenAI(OpenAIEmbedderSettings),
+    /// [Ollama](https://ollama.com/) is a framework for building and running language models locally.
+    Ollama(OllamaEmbedderSettings),
+    /// Supports arbitrary embedders which expose a [REST](https://en.wikipedia.org/wiki/REST) interface
+    REST(GenericRestEmbedderSettings),
+    /// Provide custom embeddings
+    ///
+    /// When using a custom embedder, you must vectorize both your documents (when adding and updating them) and user queries
+    UserProvided(UserProvidedEmbedderSettings),
+}
+
+/// Settings for configuring [HuggingFace](https://huggingface.co/) embedders
+///
+/// # Example
+/// ```
+/// # use meilisearch_sdk::settings::HuggingFaceEmbedderSettings;
+/// let embedder_setting = HuggingFaceEmbedderSettings {
+///     model: Some("BAAI/bge-base-en-v1.5".to_string()),
+///     document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()),
+///     ..Default::default()
+/// };
+/// # let expected = r#"{"model":"BAAI/bge-base-en-v1.5","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#;
+/// # let expected: HuggingFaceEmbedderSettings = serde_json::from_str(expected).unwrap();
+/// # assert_eq!(embedder_setting, expected);
+/// ```
+#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct HuggingFaceEmbedderSettings {
+    /// The [BERT embedding model](https://en.wikipedia.org/wiki/BERT_(language_model)) you want to use from [HuggingFace](https://huggingface.co)
+    /// Defaults to `"BAAI/bge-base-en-v1.5"`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    /// Revisions allow you to pin a specific version of a model, using a commit hash, tag or branch.
+    /// This allows (according to [huggingface](https://huggingface.co/transformers/v4.8.2/model_sharing.html)):
+    /// - built-in versioning
+    /// - access control
+    /// - scalability
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub revision: Option<String>,
+    /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents.
+    ///
+    /// If present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/).
+    /// Use `{{ doc.attribute }}` to access document field values.
+    /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`.
+    ///
+    /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields.
+    /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"`
+    ///
+    /// Default:
+    /// ```raw
+    /// {% for field in fields %}
+    ///     {% if field.is_searchable and not field.value == nil %}
+    ///         {{ field.name }}: {{ field.value }}\n
+    ///     {% endif %}
+    /// {% endfor %}
+    /// ```
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub document_template: Option<String>,
+}
+
+/// Settings for configuring [OpenAI](https://openai.com/) embedders
+///
+/// # Example
+/// ```
+/// # use meilisearch_sdk::settings::OpenAIEmbedderSettings;
+/// let embedder_setting = OpenAIEmbedderSettings {
+///     api_key: "anOpenAIApiKey".to_string(),
+///     model: Some("text-embedding-3-small".to_string()),
+///     document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()),
+///     dimensions: Some(1536),
+///     ..Default::default()
+/// };
+/// # let expected = r#"{"apiKey":"anOpenAIApiKey","model":"text-embedding-3-small","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}","dimensions":1536}"#;
+/// # let expected: OpenAIEmbedderSettings = serde_json::from_str(expected).unwrap();
+/// # assert_eq!(embedder_setting, expected);
+/// ```
+#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct OpenAIEmbedderSettings {
+    /// API key used to authorize against OpenAI.
+    /// [Generate an API key](https://platform.openai.com/api-keys) from your OpenAI account.
+    /// Use [tier 2 keys](https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-two) or above for optimal performance.
+    pub api_key: String,
+    /// The OpenAI model name
+    /// Default: `text-embedding-3-small`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    /// Defaults to the default number of dimensions of the chosen model
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub dimensions: Option<usize>,
+    /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents.
+    ///
+    /// If present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/).
+    /// Use `{{ doc.attribute }}` to access document field values.
+    /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`.
+    ///
+    /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields.
+    /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"`
+    ///
+    /// Default:
+    /// ```raw
+    /// {% for field in fields %}
+    ///     {% if field.is_searchable and not field.value == nil %}
+    ///         {{ field.name }}: {{ field.value }}\n
+    ///     {% endif %}
+    /// {% endfor %}
+    /// ```
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub document_template: Option<String>,
+}
+
+/// Settings for configuring [Ollama](https://ollama.com/) embedders
+///
+/// # Example
+/// ```
+/// # use meilisearch_sdk::settings::OllamaEmbedderSettings;
+/// let embedder_setting = OllamaEmbedderSettings {
+///     url: Some("http://localhost:11434/api/embeddings".to_string()),
+///     api_key: Some("foobarbaz".to_string()),
+///     model: "nomic-embed-text".to_string(),
+///     document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()),
+/// };
+/// # let expected = r#"{"url":"http://localhost:11434/api/embeddings","apiKey":"foobarbaz","model":"nomic-embed-text","documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}"}"#;
+/// # let expected: OllamaEmbedderSettings = serde_json::from_str(expected).unwrap();
+/// # assert_eq!(embedder_setting, expected);
+/// ```
+#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct OllamaEmbedderSettings {
+    /// Mandatory, full URL to the embedding endpoint.
+    /// Must be parseable as a URL.
+    /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the SDK you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable.
+    /// Example: `"http://localhost:11434/api/embeddings"`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub url: Option<String>,
+    /// Optional, token used to authenticate against [Ollama](https://ollama.com/)
+    /// Example: `"foobarbaz"`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub api_key: Option<String>,
+    /// See <https://ollama.com/library?q=embed> for suitable embedding models
+    ///
+    /// # Example embedding models
+    ///
+    /// | Model                    | Parameters   | Link                                                             |
+    /// |--------------------------|--------------|------------------------------------------------------------------|
+    /// | `mxbai-embed-large`      | `334M`       | [View model](https://ollama.com/library/mxbai-embed-large)      |
+    /// | `nomic-embed-text`       | `137M`       | [View model](https://ollama.com/library/nomic-embed-text)       |
+    /// | `all-minilm`             | `23M`,`33M`  | [View model](https://ollama.com/library/all-minilm)             |
+    /// | `snowflake-arctic-embed` | varies       | [View model](https://ollama.com/library/snowflake-arctic-embed) |
+    pub model: String,
+    /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents.
+    ///
+    /// If present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/).
+    /// Use `{{ doc.attribute }}` to access document field values.
+    /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`.
+    ///
+    /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields.
+    /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"`
+    ///
+    /// Default:
+    /// ```raw
+    /// {% for field in fields %}
+    ///     {% if field.is_searchable and not field.value == nil %}
+    ///         {{ field.name }}: {{ field.value }}\n
+    ///     {% endif %}
+    /// {% endfor %}
+    /// ```
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub document_template: Option<String>,
+}
+
+/// Settings for configuring generic [REST](https://en.wikipedia.org/wiki/REST) embedders
+///
+/// # Example
+/// ```
+/// # use std::collections::HashMap;
+/// # use meilisearch_sdk::settings::{GenericRestEmbedderSettings};
+/// use serde_json::Value;
+/// let embedder_setting = GenericRestEmbedderSettings {
+///     url: Some("http://localhost:12345/api/v1/embed".to_string()),
+///     api_key: Some("SOURCE_API_KEY".to_string()),
+///     dimensions: Some(512),
+///     document_template: Some("A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}".to_string()),
+///     request: HashMap::from([
+///         ("model".to_string(), Value::from("MODEL_NAME")),
+///         ("prompt".to_string(), Value::from("{{text}}"))
+///     ]),
+///     response: HashMap::from([
+///         ("model".to_string(), Value::from("{{embedding}}"))
+///     ]),
+///     headers: HashMap::from([
+///         ("X-MAGIC".to_string(), "open sesame".to_string())
+///     ]),
+/// };
+/// # let expected = serde_json::json!({
+/// #   "url":"http://localhost:12345/api/v1/embed",
+/// #   "apiKey":"SOURCE_API_KEY",
+/// #   "dimensions":512,
+/// #   "documentTemplate":"A document titled {{doc.title}} whose description starts with {{doc.overview|truncatewords: 20}}",
+/// #   "request":{"prompt":"{{text}}","model":"MODEL_NAME"},
+/// #   "response":{"model":"{{embedding}}"},
+/// #   "headers":{"X-MAGIC":"open sesame"}
+/// # });
+/// # let expected: GenericRestEmbedderSettings = serde_json::from_value(expected).unwrap();
+/// # assert_eq!(embedder_setting, expected);
+/// ```
+#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct GenericRestEmbedderSettings {
+    /// Mandatory, full URL to the embedding endpoint.
+    /// Must be parseable as a URL.
+    /// If not specified, [Meilisearch](https://www.meilisearch.com/) (**not the SDK you are currently using**) will try to fetch the `MEILI_OLLAMA_URL` environment variable.
+    /// Example: `"http://localhost:12345/api/v1/embed"`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub url: Option<String>,
+    /// Optional, passed as Bearer in the Authorization header
+    /// Example: `"187HFLDH97CNHN"`
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub api_key: Option<String>,
+    /// Optional.
+    /// Inferred with a dummy request if missing.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub dimensions: Option<usize>,
+    /// Use it to customize the data you send to the embedder. It is highly recommended you configure a custom template for your documents.
+    ///
+    /// If present, `document_template` must be a [Liquid template](https://shopify.github.io/liquid/).
+    /// Use `{{ doc.attribute }}` to access document field values.
+    /// Meilisearch also exposes a `{{ fields }}` array containing one object per document field, which you may access with `{{ field.name }}` and `{{ field.value }}`.
+    ///
+    /// For best results, use short strings indicating the type of document in that index, only include highly relevant document fields, and truncate long fields.
+    /// Example: `"A document titled '{{doc.title}}' whose description starts with {{doc.overview|truncatewords: 20}}"`
+    ///
+    /// Default:
+    /// ```raw
+    /// {% for field in fields %}
+    ///     {% if field.is_searchable and not field.value == nil %}
+    ///         {{ field.name }}: {{ field.value }}\n
+    ///     {% endif %}
+    /// {% endfor %}
+    /// ```
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub document_template: Option<String>,
+    /// A JSON value that represents the request made by Meilisearch to the remote embedder.
+    /// The text to embed must be replaced by the placeholder value `"{{text}}"`.
+    ///
+    /// Example:
+    /// ```json
+    /// {
+    ///     "model": "MODEL_NAME",
+    ///     "prompt": "{{text}}"
+    /// }
+    /// ```
+    #[serde(skip_serializing_if = "HashMap::is_empty")]
+    pub request: HashMap<String, serde_json::Value>,
+    /// A JSON value that represents a fragment of the response made by the remote embedder to Meilisearch.
+    /// The embedding must be replaced by the placeholder value `"{{embedding}}"`
+    ///
+    /// Example:
+    /// ```json
+    /// {
+    ///     "embedding": "{{embedding}}"
+    /// }
+    /// ```
+    #[serde(skip_serializing_if = "HashMap::is_empty")]
+    pub response: HashMap<String, serde_json::Value>,
+    /// JSON object whose keys represent the names and values of additional headers to send in requests.
+    ///
+    /// Embedding requests sent from Meilisearch to a remote REST embedder by default contain these headers:
+    ///
+    /// - if `api_key` was provided: `Authorization: Bearer <api_key>`
+    /// - always: `Content-Type: application/json`
+    ///
+    /// If `headers` is empty, only `Authorization` and `Content-Type` are sent, as described above.
+    /// If `headers` contains `Authorization` and `Content-Type`, the declared values will override the ones that are sent by default.
+    #[serde(skip_serializing_if = "HashMap::is_empty")]
+    pub headers: HashMap<String, String>,
+}
+
+/// Settings for a user-provided embedder
+///
+/// When using a custom embedder, you must vectorize both your documents and user queries.
+#[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq, Copy)]
+pub struct UserProvidedEmbedderSettings {
+    /// Dimensions of your custom embeddings
+    pub dimensions: usize,
+}
+
 #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)]
 #[serde(rename_all = "camelCase")]
 pub struct LocalizedAttributes {
@@ -43,7 +340,7 @@ pub struct LocalizedAttributes {
     pub attribute_patterns: Vec<String>,
 }
 
-/// Struct reprensenting a set of settings.
+/// Struct representing a set of settings.
 ///
 /// You can build this struct using the builder syntax.
 ///
@@ -110,6 +407,9 @@ pub struct Settings {
     /// Proximity precision settings.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub proximity_precision: Option<String>,
+    /// Defines how the embeddings for the vector search feature are generated
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub embedders: Option<HashMap<String, Embedder>>,
     /// SearchCutoffMs settings.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub search_cutoff_ms: Option<u64>,
@@ -308,6 +608,23 @@ impl Settings {
         }
     }
 
+    /// Set the [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index].
+    #[must_use]
+    pub fn with_embedders<S>(self, embedders: HashMap<S, Embedder>) -> Settings
+    where
+        S: AsRef<str>,
+    {
+        Settings {
+            embedders: Some(
+                embedders
+                    .into_iter()
+                    .map(|(key, value)| (key.as_ref().to_string(), value))
+                    .collect(),
+            ),
+            ..self
+        }
+    }
+
     pub fn with_search_cutoff(self, search_cutoff_ms: u64) -> Settings {
         Settings {
             search_cutoff_ms: Some(search_cutoff_ms),
@@ -804,7 +1121,7 @@ impl Index {
     /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey");
     /// #
     /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
-    /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap();
+    /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap();
     /// # client.create_index("get_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap();
     /// let index = client.index("get_typo_tolerance");
     ///
@@ -826,6 +1143,45 @@ impl Index {
             .await
     }
 
+    /// Get [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index].
+    ///
+    /// ```
+    /// # use std::collections::HashMap;
+    /// # use std::string::String;
+    /// # use meilisearch_sdk::{indexes::*,settings::Embedder,settings::UserProvidedEmbedderSettings,settings::Settings,client::*};
+    /// #
+    /// # let MEILISEARCH_URL = option_env!("MEILISEARCH_URL").unwrap_or("http://localhost:7700");
+    /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey");
+    /// #
+    /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
+    /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap();
+    /// # client.create_index("get_embedders", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap();
+    /// let index = client.index("get_embedders");
+    /// #
+    /// # let t = index.set_settings(&Settings{
+    /// #   embedders:Some(HashMap::from([(String::from("default"),Embedder::UserProvided(UserProvidedEmbedderSettings{dimensions:1}))])),
+    /// #   ..Settings::default()
+    /// # }).await.unwrap();
+    /// # t.wait_for_completion(&client, None, None).await.unwrap();
+    /// let embedders = index.get_embedders().await.unwrap();
+    /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap();
+    /// # });
+    /// ```
+    pub async fn get_embedders(&self) -> Result<HashMap<String, Embedder>, Error> {
+        self.client
+            .http_client
+            .request::<(), (), Option<HashMap<String, Embedder>>>(
+                &format!(
+                    "{}/indexes/{}/settings/embedders",
+                    self.client.host, self.uid
+                ),
+                Method::Get { query: () },
+                200,
+            )
+            .await
+            .map(|r| r.unwrap_or_default())
+    }
+
     /// Get [search cutoff](https://www.meilisearch.com/docs/reference/api/settings#search-cutoff) settings of the [Index].
/// /// # Example @@ -1473,7 +1829,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("set_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("set_typo_tolerance"); /// @@ -1601,7 +1957,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("set_proximity_precision", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("set_proximity_precision"); /// @@ -2117,7 +2473,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("reset_typo_tolerance", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("reset_typo_tolerance"); /// @@ -2150,7 +2506,7 @@ impl Index { /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey"); /// # /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { - /// let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); + /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap(); /// # client.create_index("reset_proximity_precision", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap(); /// let mut index = client.index("reset_proximity_precision"); /// @@ -2172,6 +2528,39 @@ impl Index { .await } + /// Reset [embedders](https://www.meilisearch.com/docs/learn/vector_search) of the [Index]. 
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use meilisearch_sdk::{client::*, indexes::*, settings::Settings};
+    /// #
+    /// # let MEILISEARCH_URL = option_env!("MEILISEARCH_URL").unwrap_or("http://localhost:7700");
+    /// # let MEILISEARCH_API_KEY = option_env!("MEILISEARCH_API_KEY").unwrap_or("masterKey");
+    /// #
+    /// # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
+    /// # let client = Client::new(MEILISEARCH_URL, Some(MEILISEARCH_API_KEY)).unwrap();
+    /// # client.create_index("reset_embedders", None).await.unwrap().wait_for_completion(&client, None, None).await.unwrap();
+    /// let mut index = client.index("reset_embedders");
+    ///
+    /// let task = index.reset_embedders().await.unwrap();
+    /// # index.delete().await.unwrap().wait_for_completion(&client, None, None).await.unwrap();
+    /// # });
+    /// ```
+    pub async fn reset_embedders(&self) -> Result<TaskInfo, Error> {
+        self.client
+            .http_client
+            .request::<(), (), TaskInfo>(
+                &format!(
+                    "{}/indexes/{}/settings/embedders",
+                    self.client.host, self.uid
+                ),
+                Method::Delete { query: () },
+                202,
+            )
+            .await
+    }
+
     /// Reset [search cutoff](https://www.meilisearch.com/docs/reference/api/settings#search-cutoff) settings of the [Index].
     ///
     /// # Example
@@ -2338,6 +2727,13 @@ mod tests {
         assert_eq!(faceting, res);
     }
 
+    #[meilisearch_test]
+    async fn test_get_embeddings(index: Index) {
+        let res = index.get_embedders().await.unwrap();
+
+        assert_eq!(HashMap::new(), res);
+    }
+
     #[meilisearch_test]
     async fn test_set_faceting(client: Client, index: Index) {
         let faceting = FacetingSettings {
@@ -2364,6 +2760,16 @@ mod tests {
         assert_eq!(faceting, res);
     }
 
+    #[meilisearch_test]
+    async fn test_reset_embedders(client: Client, index: Index) {
+        let task_info = index.reset_embedders().await.unwrap();
+        client.wait_for_task(task_info, None, None).await.unwrap();
+
+        let res = index.get_embedders().await.unwrap();
+
+        assert_eq!(HashMap::new(), res);
+    }
+
     #[meilisearch_test]
     async fn test_get_dictionary(index: Index) {
         let dictionary: Vec<String> = vec![];
@@ -2540,6 +2946,21 @@ mod tests {
         assert_eq!(expected, res);
     }
 
+    #[meilisearch_test]
+    async fn test_set_embedding_settings(client: Client, index: Index) {
+        let custom_embedder =
+            Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 2 });
+        let embeddings = HashMap::from([("default".into(), custom_embedder)]);
+        let settings = Settings::new().with_embedders(embeddings.clone());
+
+        let task_info = index.set_settings(&settings).await.unwrap();
+        client.wait_for_task(task_info, None, None).await.unwrap();
+
+        let res = index.get_embedders().await.unwrap();
+
+        assert_eq!(embeddings, res);
+    }
+
     #[meilisearch_test]
     async fn test_reset_proximity_precision(index: Index) {
         let expected = "byWord".to_string();
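Taken together, the pieces above wire hybrid search end to end: the experimental vector-store flag (`features.rs`), the per-index `embedders` setting (`settings.rs`), and the `hybrid`/`vector`/`retrieveVectors` search parameters (`search.rs`). The sketch below shows how they are intended to be combined. It is not part of the patch and only uses APIs visible in this diff; the running Meilisearch instance, index name, document shape, and vector values are illustrative assumptions.

```rust
use std::collections::HashMap;

use meilisearch_sdk::client::Client;
use meilisearch_sdk::features::ExperimentalFeatures;
use meilisearch_sdk::search::SearchResults;
use meilisearch_sdk::settings::{Embedder, Settings, UserProvidedEmbedderSettings};
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct Movie {
    id: usize,
    title: String,
    // Embeddings ride along on the reserved `_vectors` field, mirroring the
    // `Vectors` helper used in the search.rs tests above.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    _vectors: Option<serde_json::Value>,
}

#[tokio::main]
async fn main() -> Result<(), meilisearch_sdk::errors::Error> {
    // Assumption: a local Meilisearch instance with this master key.
    let client = Client::new("http://localhost:7700", Some("masterKey"))?;

    // 1. Enable the experimental vector store feature.
    let mut features = ExperimentalFeatures::new(&client);
    features.set_vector_store(true);
    let _ = features.update().await?;

    // 2. Declare a `userProvided` embedder named "default" with 3 dimensions.
    let settings = Settings::new().with_embedders(HashMap::from([(
        "default",
        Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 3 }),
    )]));
    let index = client.index("movies"); // illustrative index name
    index
        .set_settings(&settings)
        .await?
        .wait_for_completion(&client, None, None)
        .await?;

    // 3. Add a document that carries its own embedding for the "default" embedder.
    index
        .add_documents(
            &[Movie {
                id: 1,
                title: "Carol".to_string(),
                _vectors: Some(serde_json::json!({
                    "default": { "embeddings": [[0.1, 0.2, 0.3]], "regenerate": false }
                })),
            }],
            Some("id"),
        )
        .await?
        .wait_for_completion(&client, None, None)
        .await?;

    // 4. Run a hybrid search: 50/50 keyword/semantic, supplying the query vector
    //    (required for userProvided embedders) and returning stored vectors.
    let results: SearchResults<Movie> = index
        .search()
        .with_query("carol")
        .with_hybrid("default", 0.5)
        .with_vector(&[0.1, 0.2, 0.3])
        .with_retrieve_vectors(true)
        .execute()
        .await?;
    println!("{} hit(s)", results.hits.len());
    Ok(())
}
```

With a model-backed embedder (`HuggingFace`, `OpenAI`, `Ollama`, `REST`) the `with_vector` call can be omitted, since Meilisearch computes the query embedding itself; for a `userProvided` embedder both documents and queries must carry their own vectors, as the `UserProvided` docs above note.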