Skip to content

Commit

Permalink
Start switching
Browse files Browse the repository at this point in the history
  • Loading branch information
Mubelotix committed Nov 5, 2024
1 parent 6024a19 commit 2193616
Show file tree
Hide file tree
Showing 14 changed files with 617 additions and 763 deletions.
448 changes: 349 additions & 99 deletions Cargo.lock

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions daemon/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ sha2-derive = "0.1"
faster-pest = "0.2.0-alpha.1"
word-lists = {path="../word-lists"}
bimap = "0.6"
tantivy = "0.22"
#schemas = { path="../../schemas", features=["serde"] }
heed = { git="https://github.com/meilisearch/heed", tag="v0.12.7", default-features=false, features=["read-txn-no-tls"], optional=true }

[features]
default = []
database-lmdb = ["heed/lmdb"]
database-mdbx = ["heed/mdbx"]
default = ["persist-index"]
database-lmdb = ["persist-index"] # Deprecated features
database-mdbx = ["persist-index"] # Deprecated features
persist-index = []
207 changes: 0 additions & 207 deletions daemon/src/database.rs

This file was deleted.

27 changes: 15 additions & 12 deletions daemon/src/documents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ use scraper::{Selector, Html, ElementRef};
use crate::prelude::*;

pub struct DocumentInspectionReport {
pub words: Vec<String>,
pub filters: HashMap<&'static str, String>,
pub text_content: String,
pub description: Option<String>,
// TODO: add structured data to documentinspectionreport
}

pub fn inspect_document(raw: Vec<u8>) -> Option<DocumentInspectionReport> {
Expand Down Expand Up @@ -34,34 +35,36 @@ fn inspect_document_html(raw: &str) -> Option<DocumentInspectionReport> {
let body_selector = Selector::parse("body").expect("Invalid body selector");
let body_el = document.select(&body_selector).next();

fn list_words(el: ElementRef, words: &mut Vec<String>) {
fn list_words(el: ElementRef, text_content: &mut String) {
if ["script", "style"].contains(&el.value().name()) {
return;
}
for child in el.children() {
match child.value() {
scraper::node::Node::Element(_) => {
let child_ref = ElementRef::wrap(child).expect("Child isn't an element");
list_words(child_ref, words)
list_words(child_ref, text_content)
},
scraper::node::Node::Text(text) => {
let text = text.to_lowercase();
words.extend(text
.split(|c: char| !c.is_ascii_alphanumeric())
.filter(|w| w.len() >= 3)
.map(|w| w.to_string()))
text_content.push(' ');
text_content.push_str(text.to_string().trim());
},
_ => (),
}
}

}

let mut words = Vec::new();
let mut text_content = String::new();
if let Some(body_el) = body_el {
list_words(body_el, &mut words);
list_words(body_el, &mut text_content);
}

// Retrieve description
let description_selector = Selector::parse("meta[name=description]").expect("Invalid description selector");
let description_el = document.select(&description_selector).next();
let description = description_el.and_then(|el| el.value().attr("content").map(|c| c.to_string()));

// Get lang
let html_selector = Selector::parse("html").expect("Invalid html selector");
let html_el = document.select(&html_selector).next();
Expand All @@ -72,7 +75,7 @@ fn inspect_document_html(raw: &str) -> Option<DocumentInspectionReport> {
.unwrap_or(String::from("unknown"));
filters.insert("lang", lang);

Some(DocumentInspectionReport { words, filters })
Some(DocumentInspectionReport { text_content, description })
}

#[allow(clippy::question_mark)]
Expand Down
Loading

0 comments on commit 2193616

Please sign in to comment.