diff --git a/crates/jmap/src/lib.rs b/crates/jmap/src/lib.rs
index 459275d7..4d85ae18 100644
--- a/crates/jmap/src/lib.rs
+++ b/crates/jmap/src/lib.rs
@@ -227,11 +227,11 @@ impl JMAP {
.await
.failed("Unable to open blob store"),
));*/
- //let fts_store = FtsStore::Store(store.clone());
- let fts_store = ElasticSearchStore::open(config)
- .await
- .failed("Unable to open FTS store")
- .into();
+ let fts_store = FtsStore::Store(store.clone());
+ /*let fts_store = ElasticSearchStore::open(config)
+ .await
+ .failed("Unable to open FTS store")
+ .into();*/
let jmap_server = Arc::new(JMAP {
directory: directory_config
diff --git a/crates/store/src/backend/elastic/index.rs b/crates/store/src/backend/elastic/index.rs
index ab9813a9..3d2a867f 100644
--- a/crates/store/src/backend/elastic/index.rs
+++ b/crates/store/src/backend/elastic/index.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
use std::{borrow::Cow, fmt::Display};
use elasticsearch::{DeleteByQueryParts, IndexParts};
diff --git a/crates/store/src/backend/elastic/mod.rs b/crates/store/src/backend/elastic/mod.rs
index a8bbedf8..68b3204e 100644
--- a/crates/store/src/backend/elastic/mod.rs
+++ b/crates/store/src/backend/elastic/mod.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
use elasticsearch::{
auth::Credentials,
cert::CertificateValidation,
diff --git a/crates/store/src/backend/elastic/query.rs b/crates/store/src/backend/elastic/query.rs
index 79650cd4..dfdedbae 100644
--- a/crates/store/src/backend/elastic/query.rs
+++ b/crates/store/src/backend/elastic/query.rs
@@ -1,3 +1,26 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
use std::{borrow::Cow, fmt::Display};
use elasticsearch::SearchParts;
diff --git a/crates/store/src/fts/index.rs b/crates/store/src/fts/index.rs
index 596b737e..6894b2b9 100644
--- a/crates/store/src/fts/index.rs
+++ b/crates/store/src/fts/index.rs
@@ -21,7 +21,7 @@
* for more details.
*/
-use std::{borrow::Cow, fmt::Display};
+use std::{borrow::Cow, collections::BTreeSet, fmt::Display};
use ahash::{AHashMap, AHashSet};
use nlp::{
@@ -32,6 +32,7 @@ use nlp::{
},
tokenizers::word::WordTokenizer,
};
+use utils::codec::leb128::Leb128Reader;
use crate::{
backend::MAX_TOKEN_LENGTH,
@@ -170,6 +171,7 @@ impl Store {
let default_language = detect
.most_frequent_language()
.unwrap_or(document.default_language);
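+ // Collect the unique bigram hashes of adjacent word pairs; the BTreeSet keeps them sorted so the query side can binary-search them.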
+ let mut bigrams = BTreeSet::new();
for (field, language, text) in parts.into_iter() {
let language = if language != Language::Unknown {
@@ -182,10 +184,7 @@ impl Store {
let mut last_token = Cow::Borrowed("");
for token in Stemmer::new(&text, language, MAX_TOKEN_LENGTH) {
if !last_token.is_empty() {
- tokens
- .entry(BitmapHash::new(&format!("{} {}", last_token, token.word)))
- .or_default()
- .insert(TokenType::bigram(field));
+ bigrams.insert(BitmapHash::new(&format!("{} {}", last_token, token.word)).hash);
}
tokens
@@ -212,6 +211,13 @@ impl Store {
let mut serializer = KeySerializer::new(tokens.len() * U64_LEN * 2);
let mut keys = Vec::with_capacity(tokens.len());
+ // Write bigrams
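+ // (a leb128 item count followed by each sorted 8-byte hash)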
+ serializer = serializer.write_leb128(bigrams.len());
+ for bigram in bigrams {
+ serializer = serializer.write(bigram.as_slice());
+ }
+
+ // Write index keys
for (hash, fields) in tokens.into_iter() {
serializer = serializer
.write(hash.hash.as_slice())
@@ -336,7 +342,21 @@ impl Deserialize for TermIndex {
let bytes = lz4_flex::decompress_size_prepended(bytes)
.map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
let mut ops = Vec::new();
- let mut bytes = bytes.iter().peekable();
+
+ // Skip bigrams
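+ // The serialized term index starts with a leb128 count followed by that many 8-byte bigram hashes; they are not needed here, so skip past them.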
+ let (num_items, pos) =
+ bytes
+ .as_slice()
+ .read_leb128::<usize>()
+ .ok_or(Error::InternalError(
+ "Failed to read term index marker".to_string(),
+ ))?;
+
+ let mut bytes = bytes
+ .get(pos + (num_items * 8)..)
+ .unwrap_or_default()
+ .iter()
+ .peekable();
while bytes.peek().is_some() {
let mut hash = BitmapHash {
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 6be67a75..e549956e 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -22,20 +22,32 @@
*/
use std::{
+ borrow::Cow,
fmt::Display,
ops::{BitAndAssign, BitOrAssign, BitXorAssign},
};
+use ahash::AHashSet;
use nlp::language::stemmer::Stemmer;
use roaring::RoaringBitmap;
+use utils::codec::leb128::Leb128Reader;
-use crate::{backend::MAX_TOKEN_LENGTH, fts::FtsFilter, write::BitmapClass, BitmapKey, Store};
+use crate::{
+ backend::MAX_TOKEN_LENGTH,
+ fts::FtsFilter,
+ write::{BitmapClass, BitmapHash, ValueClass},
+ BitmapKey, Deserialize, Error, Store, ValueKey,
+};
struct State<T: Into<u8> + Display + Clone + std::fmt::Debug> {
pub op: FtsFilter<T>,
pub bm: Option<RoaringBitmap>,
}
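+/// Sorted 8-byte bigram hashes read from the prefix of a document's term index.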
+struct BigramIndex {
+ grams: Vec<[u8; 8]>,
+}
+
impl Store {
pub async fn fts_query<T: Into<u8> + Display + Clone + std::fmt::Debug>(
&self,
@@ -59,34 +71,60 @@ impl Store {
language,
} => {
let field: u8 = field.clone().into();
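+ // Build one bitmap key per word and hash each adjacent word pair; the pair hashes are verified against the stored bigram index below.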
+ let mut keys = Vec::new();
+ let mut bigrams = AHashSet::new();
+ let mut last_token = Cow::Borrowed("");
+ for token in language.tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH) {
+ keys.push(BitmapKey {
+ account_id,
+ collection,
+ class: BitmapClass::word(token.word.as_ref(), field),
+ block_num: 0,
+ });
- let tokens = language
- .tokenize_text(text.as_ref(), MAX_TOKEN_LENGTH)
- .map(|t| t.word)
- .collect::<Vec<_>>();
- let keys = if tokens.len() > 1 {
- tokens
- .windows(2)
- .map(|bg| BitmapKey {
- account_id,
- collection,
- class: BitmapClass::bigram(format!("{} {}", bg[0], bg[1]), field),
- block_num: 0,
- })
- .collect::<Vec<_>>()
- } else {
- tokens
- .into_iter()
- .map(|word| BitmapKey {
- account_id,
- collection,
- class: BitmapClass::word(word.as_ref(), field),
- block_num: 0,
- })
- .collect::<Vec<_>>()
- };
+ if !last_token.is_empty() {
+ bigrams.insert(
+ BitmapHash::new(&format!("{} {}", last_token, token.word)).hash,
+ );
+ }
- self.get_bitmaps_intersection(keys).await?
+ last_token = token.word;
+ }
+
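+ // No tokens: no match. One token: a plain bitmap intersection suffices.
+ // Several tokens: intersect the per-word bitmaps, then keep only the documents whose bigram index contains every adjacent word pair.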
+ match keys.len().cmp(&1) {
+ std::cmp::Ordering::Less => None,
+ std::cmp::Ordering::Equal => self.get_bitmaps_intersection(keys).await?,
+ std::cmp::Ordering::Greater => {
+ if let Some(document_ids) = self.get_bitmaps_intersection(keys).await? {
+ let mut results = RoaringBitmap::new();
+ for document_id in document_ids {
+ if let Some(bigram_index) = self
+ .get_value::<BigramIndex>(ValueKey {
+ account_id,
+ collection,
+ document_id,
+ class: ValueClass::TermIndex,
+ })
+ .await?
+ {
+ if bigrams.iter().all(|bigram| {
+ bigram_index.grams.binary_search(bigram).is_ok()
+ }) {
+ results.insert(document_id);
+ }
+ }
+ }
+
+ if !results.is_empty() {
+ Some(results)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
+ }
+ }
}
FtsFilter::Contains {
field,
@@ -238,6 +276,27 @@ impl Store {
}
}
+impl Deserialize for BigramIndex {
+ fn deserialize(bytes: &[u8]) -> crate::Result<Self> {
+ let bytes = lz4_flex::decompress_size_prepended(bytes)
+ .map_err(|_| Error::InternalError("Failed to decompress term index".to_string()))?;
+
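+ // Only the bigram prefix is read: a leb128 count followed by `num_items` 8-byte hashes, already sorted when written.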
+ let (num_items, pos) = bytes.read_leb128::<usize>().ok_or(Error::InternalError(
+ "Failed to read term index marker".to_string(),
+ ))?;
+
+ bytes
+ .get(pos..pos + (num_items * 8))
+ .map(|bytes| Self {
+ grams: bytes
+ .chunks_exact(8)
+ .map(|chunk| chunk.try_into().unwrap())
+ .collect(),
+ })
+ .ok_or_else(|| Error::InternalError("Failed to read term index".to_string()))
+ }
+}
+
impl<T: Into<u8> + Display + Clone + std::fmt::Debug> From<FtsFilter<T>> for State<T> {
fn from(value: FtsFilter) -> Self {
Self {
diff --git a/crates/store/src/write/hash.rs b/crates/store/src/write/hash.rs
index e1e13723..f0c882f8 100644
--- a/crates/store/src/write/hash.rs
+++ b/crates/store/src/write/hash.rs
@@ -34,18 +34,18 @@ impl BitmapClass {
}
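+ // Stemmed tokens now take over the high bit (1 << 7) that the removed bigram class used.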
pub fn stemmed(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
- BitmapClass::Text {
- field: field.into() | 1 << 6,
- token: BitmapHash::new(token),
- }
- }
-
- pub fn bigram(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
BitmapClass::Text {
field: field.into() | 1 << 7,
token: BitmapHash::new(token),
}
}
+
+ /*pub fn bigram(token: impl AsRef<[u8]>, field: impl Into<u8>) -> Self {
+ BitmapClass::Text {
+ field: field.into() | 1 << 7,
+ token: BitmapHash::new(token),
+ }
+ }*/
}
impl BitmapHash {
@@ -84,75 +84,10 @@ impl TokenType {
}
pub fn stemmed(field: u8) -> u8 {
- 1 << 6 | field
- }
-
- pub fn bigram(field: u8) -> u8 {
1 << 7 | field
}
+
+ /*pub fn bigram(field: u8) -> u8 {
+ 1 << 7 | field
+ }*/
}
-
-/*
-
-const AHASHER: ahash::RandomState = ahash::RandomState::with_seeds(
- 0xaf1f2242106c64b3,
- 0x60ca4cfb4b3ed0ce,
- 0xc7dbc0bb615e82b3,
- 0x520ad065378daf88,
-);
-lazy_static::lazy_static! {
- static ref SIPHASHER: siphasher::sip::SipHasher13 =
- siphasher::sip::SipHasher13::new_with_keys(0x56205cbdba8f02a6, 0xbd0dbc4bb06d687b);
-}
-
- let h1 = xxhash_rust::xxh3::xxh3_64(item).to_le_bytes();
- let h2 = farmhash::hash64(item).to_le_bytes();
- let h3 = AHASHER.hash_one(item).to_le_bytes();
- let mut sh = *SIPHASHER;
- sh.write(item.as_ref());
- let h4 = sh.finish().to_le_bytes();
-
- result[..2].copy_from_slice(&h1[..2]);
- result[2..4].copy_from_slice(&h2[..2]);
- result[4..6].copy_from_slice(&h3[..2]);
- result[6..8].copy_from_slice(&h4[..2]);
-
-impl KeySerializer {
- pub fn hash_text(mut self, item: impl AsRef<[u8]>) -> Self {
- let item = item.as_ref();
-
- if item.len() <= 8 {
- self.buf.extend_from_slice(item);
- } else {
- let h1 = xxhash_rust::xxh3::xxh3_64(item).to_le_bytes();
- let h2 = farmhash::hash64(item).to_le_bytes();
- let h3 = AHASHER.hash_one(item).to_le_bytes();
- let mut sh = *SIPHASHER;
- sh.write(item.as_ref());
- let h4 = sh.finish().to_le_bytes();
-
- match item.len() {
- 9..=16 => {
- self.buf.extend_from_slice(&h1[..2]);
- self.buf.extend_from_slice(&h2[..2]);
- self.buf.extend_from_slice(&h3[..2]);
- self.buf.extend_from_slice(&h4[..2]);
- }
- 17..=32 => {
- self.buf.extend_from_slice(&h1[..3]);
- self.buf.extend_from_slice(&h2[..3]);
- self.buf.extend_from_slice(&h3[..3]);
- self.buf.extend_from_slice(&h4[..3]);
- }
- _ => {
- self.buf.extend_from_slice(&h1[..4]);
- self.buf.extend_from_slice(&h2[..4]);
- self.buf.extend_from_slice(&h3[..4]);
- self.buf.extend_from_slice(&h4[..4]);
- }
- }
- }
- self
- }
-}
-*/
diff --git a/tests/src/store/mod.rs b/tests/src/store/mod.rs
index b842cc19..f3a370e8 100644
--- a/tests/src/store/mod.rs
+++ b/tests/src/store/mod.rs
@@ -29,7 +29,10 @@ use std::io::Read;
use ::store::Store;
-use store::backend::{elastic::ElasticSearchStore, rocksdb::RocksDbStore};
+use store::{
+ backend::{elastic::ElasticSearchStore, rocksdb::RocksDbStore},
+ FtsStore,
+};
use utils::config::Config;
pub struct TempDir {
@@ -72,8 +75,8 @@ pub async fn store_tests() {
//let db: Store = PostgresStore::open(&Config::new(&config_file).unwrap())
//let db: Store = MysqlStore::open(&Config::new(&config_file).unwrap())
let db: Store = RocksDbStore::open(&config).await.unwrap().into();
- //let fts_store = FtsStore::from(db.clone());
- let fts_store = ElasticSearchStore::open(&config).await.unwrap().into();
+ let fts_store = FtsStore::from(db.clone());
+ //let fts_store = ElasticSearchStore::open(&config).await.unwrap().into();
if insert {
db.destroy().await;