diff --git a/crates/jmap/src/lib.rs b/crates/jmap/src/lib.rs
index 459275d7..4d85ae18 100644
--- a/crates/jmap/src/lib.rs
+++ b/crates/jmap/src/lib.rs
@@ -227,11 +227,11 @@ impl JMAP {
                 .await
                 .failed("Unable to open blob store"),
         ));*/
-        //let fts_store = FtsStore::Store(store.clone());
-        let fts_store = ElasticSearchStore::open(config)
-            .await
-            .failed("Unable to open FTS store")
-            .into();
+        let fts_store = FtsStore::Store(store.clone());
+        /*let fts_store = ElasticSearchStore::open(config)
+            .await
+            .failed("Unable to open FTS store")
+            .into();*/
 
         let jmap_server = Arc::new(JMAP {
             directory: directory_config
diff --git a/crates/store/src/backend/elastic/index.rs b/crates/store/src/backend/elastic/index.rs
index ab9813a9..3d2a867f 100644
--- a/crates/store/src/backend/elastic/index.rs
+++ b/crates/store/src/backend/elastic/index.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
 use std::{borrow::Cow, fmt::Display};
 
 use elasticsearch::{DeleteByQueryParts, IndexParts};
diff --git a/crates/store/src/backend/elastic/mod.rs b/crates/store/src/backend/elastic/mod.rs
index a8bbedf8..68b3204e 100644
--- a/crates/store/src/backend/elastic/mod.rs
+++ b/crates/store/src/backend/elastic/mod.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
 use elasticsearch::{
     auth::Credentials,
     cert::CertificateValidation,
diff --git a/crates/store/src/backend/elastic/query.rs b/crates/store/src/backend/elastic/query.rs
index 79650cd4..dfdedbae 100644
--- a/crates/store/src/backend/elastic/query.rs
+++ b/crates/store/src/backend/elastic/query.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
 use std::{borrow::Cow, fmt::Display};
 
 use elasticsearch::SearchParts;
diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs
index 596b737e..6894b2b9 100644
--- a/crates/store/src/fts/index.rs
+++ b/crates/store/src/fts/index.rs
@@ -21,7 +21,7 @@
  * for more details.
 */
 
-use std::{borrow::Cow, fmt::Display};
+use std::{borrow::Cow, collections::BTreeSet, fmt::Display};
 
 use ahash::{AHashMap, AHashSet};
 use nlp::{
@@ -32,6 +32,7 @@ use nlp::{
     },
     tokenizers::word::WordTokenizer,
 };
+use utils::codec::leb128::Leb128Reader;
 
 use crate::{
     backend::MAX_TOKEN_LENGTH,
@@ -170,6 +171,7 @@ impl Store {
         let default_language = detect
            .most_frequent_language()
            .unwrap_or(document.default_language);
+        let mut bigrams = BTreeSet::new();
 
         for (field, language, text) in parts.into_iter() {
             let language = if language != Language::Unknown {
@@ -182,10 +184,7 @@
             let mut last_token = Cow::Borrowed("");
             for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) {
                 if !last_token.is_empty() {
-                    tokens
-                        .entry(BitmapHash::new(&format!("{} {}", last_token, token.word)))
-                        .or_default()
-                        .insert(TokenType::bigram(field));
+                    bigrams.insert(BitmapHash::new(&format!("{} {}", last_token, token.word)).hash);
                 }
 
                 tokens
@@ -212,6 +211,13 @@
         let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2);
         let mut keys = Vec::with_capacity(tokens.len());
 
+        // Write bigrams
+        serializer = serializer.write_leb128(bigrams.len());
+        for bigram in bigrams {
+            serializer = serializer.write(bigram.as_slice());
+        }
+
+        // Write index keys
         for (hash, fields) in tokens.into_iter() {
             serializer = serializer
                 .write(hash.hash.as_slice())
@@ -336,7 +342,21 @@ impl Deserialize for TermIndex {
         let bytes = lz4_flex::decompress_size_prepended(bytes)
             .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
         let mut ops = Vec::new();
-        let mut bytes = bytes.iter().peekable();
+
+        // Skip bigrams
+        let (num_items, pos) =
+            bytes
+                .as_slice()
+                .read_leb128::<usize>()
+                .ok_or(Error::InternalError(
+                    "Failed to read term index marker".to_string(),
+                ))?;
+
+        let mut bytes = bytes
+            .get(pos + (num_items * 8)..)
+            .unwrap_or_default()
+            .iter()
+            .peekable();
 
         while bytes.peek().is_some() {
             let mut hash = BitmapHash {
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 6be67a75..e549956e 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -22,20 +22,32 @@
 */
 
 use std::{
+    borrow::Cow,
     fmt::Display,
     ops::{BitAndAssign, BitOrAssign, BitXorAssign},
 };
 
+use ahash::AHashSet;
 use nlp::language::stemmer::Stemmer;
 use roaring::RoaringBitmap;
+use utils::codec::leb128::Leb128Reader;
 
-use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store};
+use crate::{
+    backend::MAX_TOKEN_LENGTH,
+    fts::FtsFilter,
+    write::{BitmapClass, BitmapHash, ValueClass},
+    BitmapKey, Deserialize, Error, Store, ValueKey,
+};
 
 struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> {
     pub op: FtsFilter<T>,
     pub bm: Option<RoaringBitmap>,
 }
 
+struct BigramIndex {
+    grams: Vec<[u8; 8]>,
+}
+
 impl Store {
     pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>(
         &self,
@@ -59,34 +71,60 @@ impl Store {
                     language,
                 } => {
                     let field: u8 = field.clone().into();
 
+                    let mut keys = Vec::new();
+                    let mut bigrams = AHashSet::new();
+                    let mut last_token = Cow::Borrowed("");
+                    for token in language.tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) {
+                        keys.push(BitmapKey {
+                            account_id,
+                            collection,
+                            class: BitmapClass::word(token.word.as_ref(), field),
+                            block_num: 0,
+                        });
 
-                    let tokens = language
-                        .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH)
-                        .map(|t| t.word)
-                        .collect::<Vec<_>>();
-                    let keys = if tokens.len() > 1 {
-                        tokens
-                            .windows(2)
-                            .map(|bg| BitmapKey {
-                                account_id,
-                                collection,
-                                class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field),
-                                block_num: 0,
-                            })
-                            .collect::<Vec<_>>()
-                    } else {
-                        tokens
-                            .into_iter()
-                            .map(|word| BitmapKey {
-                                account_id,
-                                collection,
-                                class: BitmapClass::word(word.as_ref(), field),
-                                block_num: 0,
-                            })
-                            .collect::<Vec<_>>()
-                    };
+                        if !last_token.is_empty() {
+                            bigrams.insert(
+                                BitmapHash::new(&format!("{} {}", last_token, token.word)).hash,
+                            );
+                        }
 
-                    self.get_bitmaps_intersection(keys).await?
+                        last_token = token.word;
+                    }
+
+                    match keys.len().cmp(&1) {
+                        std::cmp::Ordering::Less => None,
+                        std::cmp::Ordering::Equal => self.get_bitmaps_intersection(keys).await?,
+                        std::cmp::Ordering::Greater => {
+                            if let Some(document_ids) = self.get_bitmaps_intersection(keys).await? {
+                                let mut results = RoaringBitmap::new();
+                                for document_id in document_ids {
+                                    if let Some(bigram_index) = self
+                                        .get_value::<BigramIndex>(ValueKey {
+                                            account_id,
+                                            collection,
+                                            document_id,
+                                            class: ValueClass::TermIndex,
+                                        })
+                                        .await?
+                                    {
+                                        if bigrams.iter().all(|bigram| {
+                                            bigram_index.grams.binary_search(bigram).is_ok()
+                                        }) {
+                                            results.insert(document_id);
+                                        }
+                                    }
+                                }
+
+                                if !results.is_empty() {
+                                    Some(results)
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
+                        }
+                    }
                 }
                 FtsFilter::Contains {
                     field,
@@ -238,6 +276,27 @@ impl Store {
     }
 }
 
+impl Deserialize for BigramIndex {
+    fn deserialize(bytes: &[u8]) -> crate::Result<Self> {
+        let bytes = lz4_flex::decompress_size_prepended(bytes)
+            .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
+
+        let (num_items, pos) = bytes.read_leb128::<usize>().ok_or(Error::InternalError(
+            "Failed to read term index marker".to_string(),
+        ))?;
+
+        bytes
+            .get(pos..pos + (num_items * 8))
+            .map(|bytes| Self {
+                grams: bytes
+                    .chunks_exact(8)
+                    .map(|chunk| chunk.try_into().unwrap())
+                    .collect(),
+            })
+            .ok_or_else(|| Error::InternalError("Failed to read term index".to_string()))
+    }
+}
+
 impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> {
     fn from(value: FtsFilter<T>) -> Self {
         Self {
diff --git a/crates/store/src/write/hash.rs b/crates/store/src/write/hash.rs
index e1e13723..f0c882f8 100644
--- a/crates/store/src/write/hash.rs
+++ b/crates/store/src/write/hash.rs
@@ -34,18 +34,18 @@ impl BitmapClass {
     }
 
     pub fn stemmed(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
-        BitmapClass::Text {
-            field: field.into() | 1 << 6,
-            token: BitmapHash::new(token),
-        }
-    }
-
-    pub fn bigram(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
         BitmapClass::Text {
             field: field.into() | 1 << 7,
             token: BitmapHash::new(token),
         }
     }
+
+    /*pub fn bigram(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
+        BitmapClass::Text {
+            field: field.into() | 1 << 7,
+            token: BitmapHash::new(token),
+        }
+    }*/
 }
 
 impl BitmapHash {
@@ -84,75 +84,10 @@ impl TokenType {
     }
 
     pub fn stemmed(field: u8) -> u8 {
-        1 << 6 | field
-    }
-
-    pub fn bigram(field: u8) -> u8 {
         1 << 7 | field
     }
+
+    /*pub fn bigram(field: u8) -> u8 {
+        1 << 7 | field
+    }*/
 }
-
-/*
-
-const AHASHER: ahash::RandomState = ahash::RandomState::with_seeds(
-    0xaf1f2242106c64b3,
-    0x60ca4cfb4b3ed0ce,
-    0xc7dbc0bb615e82b3,
-    0x520ad065378daf88,
-);
-lazy_static::lazy_static! {
-    static ref SIPHASHER: siphasher::sip::SipHasher13 =
-        siphasher::sip::SipHasher13::new_with_keys(0x56205cbdba8f02a6, 0xbd0dbc4bb06d687b);
-}
-
-    let h1 = xxhash_rust::xxh3::xxh3_64(item).to_le_bytes();
-    let h2 = farmhash::hash64(item).to_le_bytes();
-    let h3 = AHASHER.hash_one(item).to_le_bytes();
-    let mut sh = *SIPHASHER;
-    sh.write(item.as_ref());
-    let h4 = sh.finish().to_le_bytes();
-
-    result[..2].copy_from_slice(&h1[..2]);
-    result[2..4].copy_from_slice(&h2[..2]);
-    result[4..6].copy_from_slice(&h3[..2]);
-    result[6..8].copy_from_slice(&h4[..2]);
-
-impl KeySerializer {
-    pub fn hash_text(mut self, item: impl AsRef<[u8]>) -> Self {
-        let item = item.as_ref();
-
-        if item.len() <= 8 {
-            self.buf.extend_from_slice(item);
-        } else {
-            let h1 = xxhash_rust::xxh3::xxh3_64(item).to_le_bytes();
-            let h2 = farmhash::hash64(item).to_le_bytes();
-            let h3 = AHASHER.hash_one(item).to_le_bytes();
-            let mut sh = *SIPHASHER;
-            sh.write(item.as_ref());
-            let h4 = sh.finish().to_le_bytes();
-
-            match item.len() {
-                9..=16 => {
-                    self.buf.extend_from_slice(&h1[..2]);
-                    self.buf.extend_from_slice(&h2[..2]);
-                    self.buf.extend_from_slice(&h3[..2]);
-                    self.buf.extend_from_slice(&h4[..2]);
-                }
-                17..=32 => {
-                    self.buf.extend_from_slice(&h1[..3]);
-                    self.buf.extend_from_slice(&h2[..3]);
-                    self.buf.extend_from_slice(&h3[..3]);
-                    self.buf.extend_from_slice(&h4[..3]);
-                }
-                _ => {
-                    self.buf.extend_from_slice(&h1[..4]);
-                    self.buf.extend_from_slice(&h2[..4]);
-                    self.buf.extend_from_slice(&h3[..4]);
-                    self.buf.extend_from_slice(&h4[..4]);
-                }
-            }
-        }
-        self
-    }
-}
-*/
diff --git a/tests/src/store/mod.rs b/tests/src/store/mod.rs
index b842cc19..f3a370e8 100644
--- a/tests/src/store/mod.rs
+++ b/tests/src/store/mod.rs
@@ -29,7 +29,10 @@
 use std::io::Read;
 
 use ::store::Store;
-use store::backend::{elastic::ElasticSearchStore, rocksdb::RocksDbStore};
+use store::{
+    backend::{elastic::ElasticSearchStore, rocksdb::RocksDbStore},
+    FtsStore,
+};
 use utils::config::Config;
 
 pub struct TempDir {
@@ -72,8 +75,8 @@ pub async fn store_tests() {
     //let db: Store = PostgresStore::open(&Config::new(&config_file).unwrap())
    //let db: Store = MysqlStore::open(&Config::new(&config_file).unwrap())
     let db: Store = RocksDbStore::open(&config).await.unwrap().into();
-    //let fts_store = FtsStore::from(db.clone());
-    let fts_store = ElasticSearchStore::open(&config).await.unwrap().into();
+    let fts_store = FtsStore::from(db.clone());
+    //let fts_store = ElasticSearchStore::open(&config).await.unwrap().into();
 
     if insert {
         db.destroy().await;
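
Reviewer note (not part of the patch): the change drops the per-field bigram bitmaps and instead stores each document's bigrams as a sorted list of 8-byte hashes, written as a LEB128-length-prefixed block in front of the term index. Exact-phrase queries first intersect the per-word bitmaps and then confirm each candidate document by binary-searching its bigram block. The sketch below illustrates that layout under simplified assumptions: hash8 and the hand-rolled LEB128 helpers are stand-ins for BitmapHash::new(..).hash and utils::codec::leb128, not the server's actual implementations.

// Standalone illustration of the bigram-prefix layout used by the patch.
use std::collections::hash_map::DefaultHasher;
use std::collections::BTreeSet;
use std::hash::{Hash, Hasher};

// Toy 8-byte hash; the real code takes the hash field of BitmapHash.
fn hash8(s: &str) -> [u8; 8] {
    let mut h = DefaultHasher::new();
    s.hash(&mut h);
    h.finish().to_le_bytes()
}

// Minimal unsigned LEB128 writer (stand-in for KeySerializer::write_leb128).
fn write_leb128(mut v: usize, out: &mut Vec<u8>) {
    loop {
        let byte = (v & 0x7f) as u8;
        v >>= 7;
        if v == 0 {
            out.push(byte);
            break;
        }
        out.push(byte | 0x80);
    }
}

// Minimal unsigned LEB128 reader; returns (value, bytes consumed), like Leb128Reader.
fn read_leb128(bytes: &[u8]) -> Option<(usize, usize)> {
    let (mut value, mut shift) = (0usize, 0);
    for (pos, &byte) in bytes.iter().enumerate() {
        value |= ((byte & 0x7f) as usize) << shift;
        if byte & 0x80 == 0 {
            return Some((value, pos + 1));
        }
        shift += 7;
    }
    None
}

// Index side: collect bigram hashes in a BTreeSet (so they come out sorted)
// and write them as a count-prefixed block, as fts/index.rs now does.
fn serialize_bigrams(text: &str) -> Vec<u8> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let bigrams: BTreeSet<[u8; 8]> = words
        .windows(2)
        .map(|w| hash8(&format!("{} {}", w[0], w[1])))
        .collect();
    let mut buf = Vec::new();
    write_leb128(bigrams.len(), &mut buf);
    for gram in bigrams {
        buf.extend_from_slice(&gram);
    }
    buf // the real serializer appends the token/field index keys after this prefix
}

// Query side: decode only the prefix and binary-search every phrase bigram,
// mirroring BigramIndex::deserialize plus the `bigrams.iter().all(..)` check.
fn phrase_matches(prefix: &[u8], phrase: &str) -> bool {
    let (num_items, pos) = match read_leb128(prefix) {
        Some(v) => v,
        None => return false,
    };
    let grams: Vec<[u8; 8]> = match prefix.get(pos..pos + num_items * 8) {
        Some(bytes) => bytes
            .chunks_exact(8)
            .map(|chunk| chunk.try_into().unwrap())
            .collect(),
        None => return false,
    };
    let words: Vec<&str> = phrase.split_whitespace().collect();
    words
        .windows(2)
        .all(|w| grams.binary_search(&hash8(&format!("{} {}", w[0], w[1]))).is_ok())
}

fn main() {
    let stored = serialize_bigrams("the quick brown fox jumps");
    assert!(phrase_matches(&stored, "quick brown fox"));
    assert!(!phrase_matches(&stored, "brown quick"));
}

Because the hashes are gathered in a BTreeSet before serialization, the block is written in sorted order, which is what makes the binary_search in the query path valid; the TermIndex deserializer only needs the same length prefix to skip past the block.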