mirror of
https://github.com/stalwartlabs/mail-server.git
synced 2025-09-09 13:34:26 +08:00
Message indexing.
This commit is contained in:
parent
9f63c86db9
commit
0778e446a6
21 changed files with 1115 additions and 64 deletions
|
@ -4,6 +4,9 @@ version = "0.1.0"
|
|||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
store = { path = "../store" }
|
||||
protocol = { path = "../protocol" }
|
||||
utils = { path = "/home/vagrant/code/utils" }
|
||||
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "serde_support", "ludicrous_mode"] }
|
||||
mail-builder = { git = "https://github.com/stalwartlabs/mail-builder", features = ["ludicrous_mode"] }
|
||||
mail-send = { git = "https://github.com/stalwartlabs/mail-send" }
|
||||
|
|
74
crates/core/src/email/headers.rs
Normal file
74
crates/core/src/email/headers.rs
Normal file
|
@ -0,0 +1,74 @@
|
|||
use mail_parser::{Addr, HeaderValue};
|
||||
use protocol::{
|
||||
object::Object,
|
||||
types::{
|
||||
property::{HeaderForm, Property},
|
||||
value::Value,
|
||||
},
|
||||
};
|
||||
|
||||
pub trait IntoForm {
|
||||
fn into_form(self, form: &HeaderForm) -> Value;
|
||||
}
|
||||
|
||||
impl IntoForm for HeaderValue<'_> {
|
||||
fn into_form(self, form: &HeaderForm) -> Value {
|
||||
match (self, form) {
|
||||
(HeaderValue::Text(text), HeaderForm::Raw | HeaderForm::Text) => text.into(),
|
||||
(HeaderValue::TextList(texts), HeaderForm::Raw | HeaderForm::Text) => {
|
||||
texts.join(", ").into()
|
||||
}
|
||||
(HeaderValue::Text(text), HeaderForm::MessageIds) => Value::List(vec![text.into()]),
|
||||
(HeaderValue::TextList(texts), HeaderForm::MessageIds) => texts.into(),
|
||||
(HeaderValue::DateTime(datetime), HeaderForm::Date) => datetime.into(),
|
||||
(
|
||||
HeaderValue::Address(Addr {
|
||||
address: Some(addr),
|
||||
..
|
||||
}),
|
||||
HeaderForm::URLs,
|
||||
) if addr.contains(':') => Value::List(vec![addr.into()]),
|
||||
(HeaderValue::AddressList(addrlist), HeaderForm::URLs) => Value::List(
|
||||
addrlist
|
||||
.into_iter()
|
||||
.filter_map(|addr| match addr {
|
||||
Addr {
|
||||
address: Some(addr),
|
||||
..
|
||||
} if addr.contains(':') => Some(addr.into()),
|
||||
_ => None,
|
||||
})
|
||||
.collect(),
|
||||
),
|
||||
(HeaderValue::Address(addr), HeaderForm::Addresses) => Value::List(vec![addr.into()]),
|
||||
(HeaderValue::AddressList(addrlist), HeaderForm::Addresses) => addrlist.into(),
|
||||
(HeaderValue::Group(group), HeaderForm::Addresses) => group.addresses.into(),
|
||||
(HeaderValue::GroupList(grouplist), HeaderForm::Addresses) => Value::List(
|
||||
grouplist
|
||||
.into_iter()
|
||||
.flat_map(|group| group.addresses)
|
||||
.map(Into::into)
|
||||
.collect(),
|
||||
),
|
||||
(HeaderValue::Address(addr), HeaderForm::GroupedAddresses) => {
|
||||
Value::List(vec![Object::with_capacity(2)
|
||||
.with_property(Property::Name, Value::Null)
|
||||
.with_property(Property::Addresses, Value::List(vec![addr.into()]))
|
||||
.into()])
|
||||
}
|
||||
|
||||
(HeaderValue::AddressList(addrlist), HeaderForm::GroupedAddresses) => {
|
||||
Value::List(vec![Object::with_capacity(2)
|
||||
.with_property(Property::Name, Value::Null)
|
||||
.with_property(Property::Addresses, addrlist)
|
||||
.into()])
|
||||
}
|
||||
(HeaderValue::Group(group), HeaderForm::GroupedAddresses) => {
|
||||
Value::List(vec![group.into()])
|
||||
}
|
||||
(HeaderValue::GroupList(grouplist), HeaderForm::GroupedAddresses) => grouplist.into(),
|
||||
|
||||
_ => Value::Null,
|
||||
}
|
||||
}
|
||||
}
|
1
crates/core/src/email/import.rs
Normal file
1
crates/core/src/email/import.rs
Normal file
|
@ -0,0 +1 @@
|
|||
|
534
crates/core/src/email/index.rs
Normal file
534
crates/core/src/email/index.rs
Normal file
|
@ -0,0 +1,534 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use mail_parser::{
|
||||
decoders::html::html_to_text,
|
||||
parsers::{fields::thread::thread_name, preview::preview_text},
|
||||
Addr, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType, RfcHeader,
|
||||
};
|
||||
use protocol::{
|
||||
object::Object,
|
||||
types::{
|
||||
date::UTCDate,
|
||||
keyword::Keyword,
|
||||
property::{HeaderForm, Property},
|
||||
value::Value,
|
||||
},
|
||||
};
|
||||
use store::{
|
||||
fts::{builder::FtsIndexBuilder, Language},
|
||||
write::{BatchBuilder, F_INDEX, F_TOKENIZE, F_VALUE},
|
||||
};
|
||||
|
||||
use crate::email::headers::IntoForm;
|
||||
|
||||
pub const MAX_MESSAGE_PARTS: usize = 1000;
|
||||
pub const MAX_ID_LENGTH: usize = 100;
|
||||
pub const MAX_SORT_FIELD_LENGTH: usize = 255;
|
||||
pub const MAX_STORED_FIELD_LENGTH: usize = 512;
|
||||
pub const PREVIEW_LENGTH: usize = 256;
|
||||
|
||||
trait IndexMessage {
|
||||
fn index_message(
|
||||
&mut self,
|
||||
message: Message,
|
||||
keywords: Vec<Keyword>,
|
||||
mailbox_ids: Vec<u32>,
|
||||
received_at: u64,
|
||||
default_language: Language,
|
||||
) -> store::Result<()>;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
o id
|
||||
o blobId
|
||||
o threadId
|
||||
o mailboxIds
|
||||
o keywords
|
||||
o receivedAt
|
||||
|
||||
*/
|
||||
|
||||
impl IndexMessage for BatchBuilder {
|
||||
fn index_message(
|
||||
&mut self,
|
||||
message: Message,
|
||||
keywords: Vec<Keyword>,
|
||||
mailbox_ids: Vec<u32>,
|
||||
received_at: u64,
|
||||
default_language: Language,
|
||||
) -> store::Result<()> {
|
||||
let mut object = Object::with_capacity(10);
|
||||
|
||||
// Index keywords
|
||||
self.value(
|
||||
Property::Keywords,
|
||||
Value::from(keywords),
|
||||
F_VALUE | F_TOKENIZE,
|
||||
);
|
||||
|
||||
// Index mailboxIds
|
||||
self.value(
|
||||
Property::MailboxIds,
|
||||
Value::from(mailbox_ids),
|
||||
F_VALUE | F_TOKENIZE,
|
||||
);
|
||||
|
||||
// Index size
|
||||
object.append(Property::Size, message.raw_message.len());
|
||||
self.value(Property::Size, message.raw_message.len() as u32, F_INDEX);
|
||||
|
||||
// Index receivedAt
|
||||
object.append(
|
||||
Property::ReceivedAt,
|
||||
Value::Date(UTCDate::from_timestamp(received_at as i64)),
|
||||
);
|
||||
self.value(Property::ReceivedAt, received_at, F_INDEX);
|
||||
|
||||
let mut fts = FtsIndexBuilder::with_default_language(default_language);
|
||||
let mut seen_headers = [false; 40];
|
||||
let mut language = Language::Unknown;
|
||||
let mut has_attachments = false;
|
||||
let preview_part_id = message
|
||||
.text_body
|
||||
.first()
|
||||
.or_else(|| message.html_body.first())
|
||||
.copied()
|
||||
.unwrap_or(usize::MAX);
|
||||
|
||||
for (part_id, part) in message
|
||||
.parts
|
||||
.into_iter()
|
||||
.take(MAX_MESSAGE_PARTS)
|
||||
.enumerate()
|
||||
{
|
||||
let part_language = part.language().unwrap_or(language);
|
||||
if part_id == 0 {
|
||||
language = part_language;
|
||||
for header in part.headers.into_iter().rev() {
|
||||
if let HeaderName::Rfc(rfc_header) = header.name {
|
||||
match rfc_header {
|
||||
RfcHeader::MessageId
|
||||
| RfcHeader::InReplyTo
|
||||
| RfcHeader::References
|
||||
| RfcHeader::ResentMessageId => {
|
||||
match &header.value {
|
||||
HeaderValue::Text(id) if id.len() < MAX_ID_LENGTH => {
|
||||
self.value(Property::MessageId, id.as_ref(), F_INDEX);
|
||||
}
|
||||
HeaderValue::TextList(ids) => {
|
||||
for id in ids {
|
||||
if id.len() < MAX_ID_LENGTH {
|
||||
self.value(
|
||||
Property::MessageId,
|
||||
id.as_ref(),
|
||||
F_INDEX,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
if matches!(rfc_header, RfcHeader::MessageId | RfcHeader::InReplyTo)
|
||||
&& !seen_headers[rfc_header as usize]
|
||||
{
|
||||
object.append(
|
||||
rfc_header.into(),
|
||||
header
|
||||
.value
|
||||
.trim_text(MAX_STORED_FIELD_LENGTH)
|
||||
.into_form(&HeaderForm::MessageIds),
|
||||
);
|
||||
seen_headers[rfc_header as usize] = true;
|
||||
}
|
||||
}
|
||||
RfcHeader::From
|
||||
| RfcHeader::To
|
||||
| RfcHeader::Cc
|
||||
| RfcHeader::Bcc
|
||||
| RfcHeader::ReplyTo
|
||||
| RfcHeader::Sender => {
|
||||
let seen_header = seen_headers[rfc_header as usize];
|
||||
if matches!(
|
||||
rfc_header,
|
||||
RfcHeader::From
|
||||
| RfcHeader::To
|
||||
| RfcHeader::Cc
|
||||
| RfcHeader::Bcc
|
||||
) {
|
||||
let mut sort_text =
|
||||
String::with_capacity(MAX_SORT_FIELD_LENGTH);
|
||||
let mut found_addr = seen_header;
|
||||
let mut last_is_space = true;
|
||||
|
||||
header.value.visit_addresses(|value, is_addr| {
|
||||
if !found_addr {
|
||||
if !sort_text.is_empty() {
|
||||
sort_text.push(' ');
|
||||
last_is_space = true;
|
||||
}
|
||||
found_addr = is_addr;
|
||||
'outer: for ch in value.chars() {
|
||||
for ch in ch.to_lowercase() {
|
||||
if sort_text.len() < MAX_SORT_FIELD_LENGTH {
|
||||
let is_space = ch.is_whitespace();
|
||||
if !is_space || !last_is_space {
|
||||
sort_text.push(ch);
|
||||
last_is_space = is_space;
|
||||
}
|
||||
} else {
|
||||
found_addr = true;
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Index an address name or email without stemming
|
||||
fts.index_raw(rfc_header, value);
|
||||
});
|
||||
|
||||
if !seen_header {
|
||||
// Add address to inverted index
|
||||
self.value(
|
||||
rfc_header,
|
||||
if !sort_text.is_empty() {
|
||||
&sort_text
|
||||
} else {
|
||||
"!"
|
||||
},
|
||||
F_INDEX,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if !seen_header {
|
||||
// Add address to object
|
||||
object.append(
|
||||
rfc_header.into(),
|
||||
header
|
||||
.value
|
||||
.trim_text(MAX_STORED_FIELD_LENGTH)
|
||||
.into_form(&HeaderForm::Addresses),
|
||||
);
|
||||
seen_headers[rfc_header as usize] = true;
|
||||
}
|
||||
}
|
||||
RfcHeader::Date => {
|
||||
if !seen_headers[rfc_header as usize] {
|
||||
if let HeaderValue::DateTime(datetime) = &header.value {
|
||||
self.value(
|
||||
Property::SentAt,
|
||||
datetime.to_timestamp() as u64,
|
||||
F_INDEX,
|
||||
);
|
||||
}
|
||||
object.append(
|
||||
Property::SentAt,
|
||||
header.value.into_form(&HeaderForm::Date),
|
||||
);
|
||||
seen_headers[rfc_header as usize] = true;
|
||||
}
|
||||
}
|
||||
RfcHeader::Subject => {
|
||||
// Index subject
|
||||
let subject = match &header.value {
|
||||
HeaderValue::Text(text) => text.clone(),
|
||||
HeaderValue::TextList(list) if !list.is_empty() => {
|
||||
list.first().unwrap().clone()
|
||||
}
|
||||
_ => "".into(),
|
||||
};
|
||||
|
||||
if !seen_headers[rfc_header as usize] {
|
||||
// Add to object
|
||||
object.append(
|
||||
Property::Subject,
|
||||
header
|
||||
.value
|
||||
.trim_text(MAX_STORED_FIELD_LENGTH)
|
||||
.into_form(&HeaderForm::Text),
|
||||
);
|
||||
|
||||
// Index thread name
|
||||
let thread_name = thread_name(&subject);
|
||||
self.value(
|
||||
Property::Subject,
|
||||
if !thread_name.is_empty() {
|
||||
thread_name.trim_text(MAX_SORT_FIELD_LENGTH)
|
||||
} else {
|
||||
"!"
|
||||
},
|
||||
F_INDEX,
|
||||
);
|
||||
|
||||
seen_headers[rfc_header as usize] = true;
|
||||
}
|
||||
|
||||
// Index subject for FTS
|
||||
fts.index(Property::Subject, subject, language);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match part.body {
|
||||
PartType::Text(text) => {
|
||||
if part_id == preview_part_id {
|
||||
object.append(
|
||||
Property::Preview,
|
||||
preview_text(text.clone(), PREVIEW_LENGTH),
|
||||
);
|
||||
}
|
||||
|
||||
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
|
||||
{
|
||||
fts.index(Property::TextBody, text, part_language);
|
||||
} else {
|
||||
fts.index(Property::Attachments, text, part_language);
|
||||
has_attachments = true;
|
||||
}
|
||||
}
|
||||
PartType::Html(html) => {
|
||||
let text = html_to_text(&html);
|
||||
if part_id == preview_part_id {
|
||||
object.append(
|
||||
Property::Preview,
|
||||
preview_text(text.clone().into(), PREVIEW_LENGTH),
|
||||
);
|
||||
}
|
||||
|
||||
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
|
||||
{
|
||||
fts.index(Property::TextBody, text, part_language);
|
||||
} else {
|
||||
fts.index(Property::Attachments, text, part_language);
|
||||
has_attachments = true;
|
||||
}
|
||||
}
|
||||
PartType::Binary(_) if !has_attachments => {
|
||||
has_attachments = true;
|
||||
}
|
||||
PartType::Message(mut nested_message) => {
|
||||
let nested_message_language = nested_message
|
||||
.root_part()
|
||||
.language()
|
||||
.unwrap_or(Language::Unknown);
|
||||
if let Some(HeaderValue::Text(subject)) =
|
||||
nested_message.remove_header_rfc(RfcHeader::Subject)
|
||||
{
|
||||
fts.index(
|
||||
Property::Attachments,
|
||||
subject.into_owned(),
|
||||
nested_message_language,
|
||||
);
|
||||
}
|
||||
|
||||
for sub_part in nested_message.parts.into_iter().take(MAX_MESSAGE_PARTS) {
|
||||
let language = sub_part.language().unwrap_or(nested_message_language);
|
||||
match sub_part.body {
|
||||
PartType::Text(text) => {
|
||||
fts.index(Property::Attachments, text, language);
|
||||
}
|
||||
PartType::Html(html) => {
|
||||
fts.index(Property::Attachments, html_to_text(&html), language);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
if !has_attachments {
|
||||
has_attachments = true;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Store and index hasAttachment property
|
||||
object.append(Property::HasAttachment, has_attachments);
|
||||
if has_attachments {
|
||||
self.bitmap(Property::HasAttachment, (), 0);
|
||||
}
|
||||
|
||||
// Store properties
|
||||
self.value(Property::BodyStructure, Value::from(object), F_VALUE);
|
||||
|
||||
// Store full text index
|
||||
self.custom(fts)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
trait GetContentLanguage {
|
||||
fn language(&self) -> Option<Language>;
|
||||
}
|
||||
|
||||
impl GetContentLanguage for MessagePart<'_> {
|
||||
fn language(&self) -> Option<Language> {
|
||||
self.headers.rfc(&RfcHeader::ContentLanguage).and_then(|v| {
|
||||
Language::from_iso_639(match v {
|
||||
HeaderValue::Text(v) => v.as_ref(),
|
||||
HeaderValue::TextList(v) => v.first()?,
|
||||
_ => {
|
||||
return None;
|
||||
}
|
||||
})
|
||||
.unwrap_or(Language::Unknown)
|
||||
.into()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait VisitAddresses {
|
||||
fn visit_addresses(&self, visitor: impl FnMut(&str, bool));
|
||||
}
|
||||
|
||||
impl VisitAddresses for HeaderValue<'_> {
|
||||
fn visit_addresses(&self, mut visitor: impl FnMut(&str, bool)) {
|
||||
match self {
|
||||
HeaderValue::Address(addr) => {
|
||||
if let Some(name) = &addr.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
if let Some(addr) = &addr.address {
|
||||
visitor(addr.as_ref(), true);
|
||||
}
|
||||
}
|
||||
HeaderValue::AddressList(addr_list) => {
|
||||
for addr in addr_list {
|
||||
if let Some(name) = &addr.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
if let Some(addr) = &addr.address {
|
||||
visitor(addr.as_ref(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
HeaderValue::Group(group) => {
|
||||
if let Some(name) = &group.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
for addr in &group.addresses {
|
||||
if let Some(name) = &addr.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
if let Some(addr) = &addr.address {
|
||||
visitor(addr.as_ref(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
HeaderValue::GroupList(groups) => {
|
||||
for group in groups {
|
||||
if let Some(name) = &group.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
for addr in &group.addresses {
|
||||
if let Some(name) = &addr.name {
|
||||
visitor(name.as_ref(), false);
|
||||
}
|
||||
if let Some(addr) = &addr.address {
|
||||
visitor(addr.as_ref(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trait TrimTextValue {
|
||||
fn trim_text(self, length: usize) -> Self;
|
||||
}
|
||||
|
||||
impl TrimTextValue for HeaderValue<'_> {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
match self {
|
||||
HeaderValue::Address(v) => HeaderValue::Address(v.trim_text(length)),
|
||||
HeaderValue::AddressList(v) => HeaderValue::AddressList(v.trim_text(length)),
|
||||
HeaderValue::Group(v) => HeaderValue::Group(v.trim_text(length)),
|
||||
HeaderValue::GroupList(v) => HeaderValue::GroupList(v.trim_text(length)),
|
||||
HeaderValue::Text(v) => HeaderValue::Text(v.trim_text(length)),
|
||||
HeaderValue::TextList(v) => HeaderValue::TextList(v.trim_text(length)),
|
||||
v => v,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TrimTextValue for Addr<'_> {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
Self {
|
||||
name: self.name.map(|v| v.trim_text(length)),
|
||||
address: self.address.map(|v| v.trim_text(length)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TrimTextValue for Group<'_> {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
Self {
|
||||
name: self.name.map(|v| v.trim_text(length)),
|
||||
addresses: self.addresses.trim_text(length),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TrimTextValue for Cow<'_, str> {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
if self.len() < length {
|
||||
self
|
||||
} else {
|
||||
match self {
|
||||
Cow::Borrowed(v) => v.trim_text(length).into(),
|
||||
Cow::Owned(v) => v.trim_text(length).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TrimTextValue for &str {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
if self.len() < length {
|
||||
self
|
||||
} else {
|
||||
let mut index = 0;
|
||||
|
||||
for (i, _) in self.char_indices() {
|
||||
if i > length {
|
||||
break;
|
||||
}
|
||||
index = i;
|
||||
}
|
||||
|
||||
&self[..index]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TrimTextValue for String {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
if self.len() < length {
|
||||
self
|
||||
} else {
|
||||
let mut result = String::with_capacity(length);
|
||||
for (i, c) in self.char_indices() {
|
||||
if i > length {
|
||||
break;
|
||||
}
|
||||
result.push(c);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TrimTextValue> TrimTextValue for Vec<T> {
|
||||
fn trim_text(self, length: usize) -> Self {
|
||||
self.into_iter().map(|v| v.trim_text(length)).collect()
|
||||
}
|
||||
}
|
3
crates/core/src/email/mod.rs
Normal file
3
crates/core/src/email/mod.rs
Normal file
|
@ -0,0 +1,3 @@
|
|||
pub mod headers;
|
||||
pub mod import;
|
||||
pub mod index;
|
|
@ -0,0 +1 @@
|
|||
pub mod email;
|
|
@ -1,10 +1,12 @@
|
|||
[package]
|
||||
name = "procotol"
|
||||
name = "protocol"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
store = { path = "../store" }
|
||||
utils = { path = "/home/vagrant/code/utils" }
|
||||
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "serde_support", "ludicrous_mode"] }
|
||||
fast-float = "0.2.0"
|
||||
serde = { version = "1.0", features = ["derive"]}
|
||||
ahash = { version = "0.8.0", features = ["serde"] }
|
||||
|
|
|
@ -5,6 +5,7 @@ pub mod parser;
|
|||
pub mod request;
|
||||
pub mod types;
|
||||
|
||||
/*
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::BTreeMap, sync::Arc};
|
||||
|
@ -107,3 +108,4 @@ mod tests {
|
|||
}*/
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
|
|
@ -3,11 +3,95 @@ pub mod email_submission;
|
|||
pub mod mailbox;
|
||||
pub mod sieve;
|
||||
|
||||
use store::{
|
||||
write::{IntoBitmap, Operation, Tokenize},
|
||||
Serialize,
|
||||
};
|
||||
use utils::map::vec_map::VecMap;
|
||||
|
||||
use crate::types::property::Property;
|
||||
use crate::types::{property::Property, value::Value};
|
||||
|
||||
#[derive(Debug, Clone, Default, serde::Serialize)]
|
||||
#[derive(Debug, Clone, Default, serde::Serialize, PartialEq, Eq)]
|
||||
pub struct Object<T> {
|
||||
pub properties: VecMap<Property, T>,
|
||||
}
|
||||
|
||||
impl Object<Value> {
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
Self {
|
||||
properties: VecMap::with_capacity(capacity),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set(&mut self, property: Property, value: impl Into<Value>) -> bool {
|
||||
self.properties.set(property, value.into())
|
||||
}
|
||||
|
||||
pub fn append(&mut self, property: Property, value: impl Into<Value>) {
|
||||
self.properties.append(property, value.into());
|
||||
}
|
||||
|
||||
pub fn with_property(mut self, property: Property, value: impl Into<Value>) -> Self {
|
||||
self.properties.append(property, value.into());
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Value {
|
||||
fn serialize(self) -> Vec<u8> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenize for Value {
|
||||
fn tokenize(&self, ops: &mut Vec<store::write::Operation>, field: u8, set: bool) {
|
||||
match self {
|
||||
Value::Text(text) => text.as_str().tokenize(ops, field, set),
|
||||
Value::Keyword(keyword) => {
|
||||
let (key, family) = keyword.into_bitmap();
|
||||
ops.push(Operation::Bitmap {
|
||||
family,
|
||||
field,
|
||||
key,
|
||||
set,
|
||||
});
|
||||
}
|
||||
Value::UnsignedInt(int) => {
|
||||
let (key, family) = (*int as u32).into_bitmap();
|
||||
ops.push(Operation::Bitmap {
|
||||
family,
|
||||
field,
|
||||
key,
|
||||
set,
|
||||
});
|
||||
}
|
||||
Value::List(items) => {
|
||||
for item in items {
|
||||
match item {
|
||||
Value::Text(text) => text.as_str().tokenize(ops, field, set),
|
||||
Value::UnsignedInt(int) => {
|
||||
let (key, family) = (*int as u32).into_bitmap();
|
||||
ops.push(Operation::Bitmap {
|
||||
family,
|
||||
field,
|
||||
key,
|
||||
set,
|
||||
});
|
||||
}
|
||||
Value::Keyword(keyword) => {
|
||||
let (key, family) = keyword.into_bitmap();
|
||||
ops.push(Operation::Bitmap {
|
||||
family,
|
||||
field,
|
||||
key,
|
||||
set,
|
||||
})
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
use std::fmt::Display;
|
||||
|
||||
use store::{write::IntoBitmap, Serialize, BM_TAG, TAG_STATIC, TAG_TEXT};
|
||||
|
||||
use crate::parser::{json::Parser, JsonObjectParser};
|
||||
|
||||
pub const SEEN: u8 = 0;
|
||||
|
@ -114,3 +116,43 @@ impl Display for Keyword {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoBitmap for Keyword {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
match self {
|
||||
Keyword::Seen => (vec![0u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Draft => (vec![1u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Flagged => (vec![2u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Answered => (vec![3u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Recent => (vec![4u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Important => (vec![5u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Phishing => (vec![6u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Junk => (vec![7u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::NotJunk => (vec![8u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Deleted => (vec![9u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Forwarded => (vec![10u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::MdnSent => (vec![11u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Other(string) => (string.serialize(), BM_TAG | TAG_TEXT),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoBitmap for &Keyword {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
match self {
|
||||
Keyword::Seen => (vec![0u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Draft => (vec![1u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Flagged => (vec![2u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Answered => (vec![3u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Recent => (vec![4u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Important => (vec![5u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Phishing => (vec![6u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Junk => (vec![7u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::NotJunk => (vec![8u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Deleted => (vec![9u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Forwarded => (vec![10u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::MdnSent => (vec![11u8], BM_TAG | TAG_STATIC),
|
||||
Keyword::Other(string) => (string.as_str().serialize(), BM_TAG | TAG_TEXT),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
use std::fmt::{Display, Formatter};
|
||||
|
||||
use mail_parser::RfcHeader;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::parser::{json::Parser, Error, JsonObjectParser};
|
||||
|
@ -805,3 +806,126 @@ impl Display for HeaderForm {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Property> for u8 {
|
||||
fn from(value: Property) -> Self {
|
||||
match value {
|
||||
Property::IsActive => 0,
|
||||
Property::IsEnabled => 1,
|
||||
Property::IsSubscribed => 2,
|
||||
Property::Keys => 3,
|
||||
Property::Keywords => 4,
|
||||
Property::Language => 5,
|
||||
Property::Location => 6,
|
||||
Property::MailboxIds => 7,
|
||||
Property::MayDelete => 8,
|
||||
Property::MdnBlobIds => 9,
|
||||
Property::Members => 10,
|
||||
Property::MessageId => 11,
|
||||
Property::MyRights => 12,
|
||||
Property::Name => 13,
|
||||
Property::ParentId => 14,
|
||||
Property::PartId => 15,
|
||||
Property::Picture => 16,
|
||||
Property::Preview => 17,
|
||||
Property::Quota => 18,
|
||||
Property::ReceivedAt => 19,
|
||||
Property::References => 20,
|
||||
Property::ReplyTo => 21,
|
||||
Property::Role => 22,
|
||||
Property::Secret => 23,
|
||||
Property::SendAt => 24,
|
||||
Property::Sender => 25,
|
||||
Property::SentAt => 26,
|
||||
Property::Size => 27,
|
||||
Property::SortOrder => 28,
|
||||
Property::Subject => 29,
|
||||
Property::SubParts => 30,
|
||||
Property::TextBody => 31,
|
||||
Property::TextSignature => 32,
|
||||
Property::ThreadId => 33,
|
||||
Property::Timezone => 34,
|
||||
Property::To => 35,
|
||||
Property::ToDate => 36,
|
||||
Property::TotalEmails => 37,
|
||||
Property::TotalThreads => 38,
|
||||
Property::Type => 39,
|
||||
Property::Types => 40,
|
||||
Property::UndoStatus => 41,
|
||||
Property::UnreadEmails => 42,
|
||||
Property::UnreadThreads => 43,
|
||||
Property::Url => 44,
|
||||
Property::VerificationCode => 45,
|
||||
Property::Parameters => 46,
|
||||
Property::Addresses => 47,
|
||||
Property::P256dh => 48,
|
||||
Property::Auth => 49,
|
||||
Property::Value => 50,
|
||||
Property::SmtpReply => 51,
|
||||
Property::Delivered => 52,
|
||||
Property::Displayed => 53,
|
||||
Property::MailFrom => 54,
|
||||
Property::RcptTo => 55,
|
||||
Property::IsEncodingProblem => 56,
|
||||
Property::IsTruncated => 57,
|
||||
Property::MayReadItems => 58,
|
||||
Property::MayAddItems => 59,
|
||||
Property::MayRemoveItems => 60,
|
||||
Property::MaySetSeen => 61,
|
||||
Property::MaySetKeywords => 62,
|
||||
Property::MayCreateChild => 63,
|
||||
Property::MayRename => 64,
|
||||
Property::MaySubmit => 65,
|
||||
Property::Acl => 66,
|
||||
Property::Aliases => 67,
|
||||
Property::Attachments => 68,
|
||||
Property::Bcc => 69,
|
||||
Property::BlobId => 70,
|
||||
Property::BodyStructure => 71,
|
||||
Property::BodyValues => 72,
|
||||
Property::Capabilities => 73,
|
||||
Property::Cc => 74,
|
||||
Property::Charset => 75,
|
||||
Property::Cid => 76,
|
||||
Property::DeliveryStatus => 77,
|
||||
Property::Description => 78,
|
||||
Property::DeviceClientId => 79,
|
||||
Property::Disposition => 80,
|
||||
Property::DsnBlobIds => 81,
|
||||
Property::Email => 82,
|
||||
Property::EmailId => 83,
|
||||
Property::EmailIds => 84,
|
||||
Property::Envelope => 85,
|
||||
Property::Expires => 86,
|
||||
Property::From => 87,
|
||||
Property::FromDate => 88,
|
||||
Property::HasAttachment => 89,
|
||||
Property::Header(_) => 90,
|
||||
Property::Headers => 91,
|
||||
Property::HtmlBody => 92,
|
||||
Property::HtmlSignature => 93,
|
||||
Property::Id => 94,
|
||||
Property::IdentityId => 95,
|
||||
Property::InReplyTo => 96,
|
||||
Property::_T(_) => 97,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RfcHeader> for Property {
|
||||
fn from(value: RfcHeader) -> Self {
|
||||
match value {
|
||||
RfcHeader::Subject => Property::Subject,
|
||||
RfcHeader::From => Property::From,
|
||||
RfcHeader::To => Property::To,
|
||||
RfcHeader::Cc => Property::Cc,
|
||||
RfcHeader::Date => Property::SentAt,
|
||||
RfcHeader::Bcc => Property::Bcc,
|
||||
RfcHeader::ReplyTo => Property::ReplyTo,
|
||||
RfcHeader::Sender => Property::Sender,
|
||||
RfcHeader::InReplyTo => Property::InReplyTo,
|
||||
RfcHeader::MessageId => Property::MessageId,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
use std::fmt::Display;
|
||||
use std::{borrow::Cow, fmt::Display};
|
||||
|
||||
use mail_parser::{Addr, DateTime, Group};
|
||||
use serde::Serialize;
|
||||
use utils::map::vec_map::VecMap;
|
||||
|
||||
use crate::{
|
||||
object::Object,
|
||||
parser::{json::Parser, Ignore, JsonObjectParser, Token},
|
||||
request::reference::ResultReference,
|
||||
};
|
||||
|
@ -30,7 +31,7 @@ pub enum Value {
|
|||
TypeState(TypeState),
|
||||
Acl(Acl),
|
||||
List(Vec<Value>),
|
||||
Object(VecMap<Property, Value>),
|
||||
Object(Object<Value>),
|
||||
Null,
|
||||
}
|
||||
|
||||
|
@ -57,7 +58,7 @@ impl Value {
|
|||
Ok(match parser.next_token::<V>()? {
|
||||
Token::String(v) => v.into_value(),
|
||||
Token::DictStart => {
|
||||
let mut properties = VecMap::with_capacity(4);
|
||||
let mut properties = Object::with_capacity(4);
|
||||
while {
|
||||
let property = parser.next_dict_key::<K>()?.into_property();
|
||||
let value = Value::from_property(parser, &property)?;
|
||||
|
@ -204,3 +205,102 @@ impl IntoValue for TypeState {
|
|||
Value::TypeState(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<usize> for Value {
|
||||
fn from(value: usize) -> Self {
|
||||
Value::UnsignedInt(value as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for Value {
|
||||
fn from(value: u64) -> Self {
|
||||
Value::UnsignedInt(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u32> for Value {
|
||||
fn from(value: u32) -> Self {
|
||||
Value::UnsignedInt(value as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<String> for Value {
|
||||
fn from(value: String) -> Self {
|
||||
Value::Text(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<bool> for Value {
|
||||
fn from(value: bool) -> Self {
|
||||
Value::Bool(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Keyword> for Value {
|
||||
fn from(value: Keyword) -> Self {
|
||||
Value::Keyword(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Object<Value>> for Value {
|
||||
fn from(value: Object<Value>) -> Self {
|
||||
Value::Object(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DateTime> for Value {
|
||||
fn from(date: DateTime) -> Self {
|
||||
Value::Date(UTCDate {
|
||||
year: date.year,
|
||||
month: date.month,
|
||||
day: date.day,
|
||||
hour: date.hour,
|
||||
minute: date.minute,
|
||||
second: date.second,
|
||||
tz_before_gmt: date.tz_before_gmt,
|
||||
tz_hour: date.tz_hour,
|
||||
tz_minute: date.tz_minute,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cow<'_, str>> for Value {
|
||||
fn from(value: Cow<'_, str>) -> Self {
|
||||
Value::Text(value.into_owned())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Into<Value>> From<Vec<T>> for Value {
|
||||
fn from(value: Vec<T>) -> Self {
|
||||
Value::List(value.into_iter().map(|v| v.into()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Into<Value>> From<Option<T>> for Value {
|
||||
fn from(value: Option<T>) -> Self {
|
||||
match value {
|
||||
Some(value) => value.into(),
|
||||
None => Value::Null,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Addr<'_>> for Value {
|
||||
fn from(value: Addr<'_>) -> Self {
|
||||
Value::Object(
|
||||
Object::with_capacity(2)
|
||||
.with_property(Property::Name, value.name)
|
||||
.with_property(Property::Email, value.address.unwrap_or_default()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Group<'_>> for Value {
|
||||
fn from(group: Group<'_>) -> Self {
|
||||
Value::Object(
|
||||
Object::with_capacity(2)
|
||||
.with_property(Property::Name, group.name)
|
||||
.with_property(Property::Addresses, group.addresses),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ utils = { path = "/home/vagrant/code/utils" }
|
|||
rocksdb = { version = "0.20.1", optional = true }
|
||||
foundationdb = { version = "0.7.0", optional = true }
|
||||
rusqlite = { version = "0.29.0", features = ["bundled"], optional = true }
|
||||
tokio = { version = "1.23", features = ["sync"], optional = true }
|
||||
tokio = { version = "1.23", features = ["sync", "fs", "io-util"], optional = true }
|
||||
r2d2 = { version = "0.8.10", optional = true }
|
||||
futures = { version = "0.3", optional = true }
|
||||
rand = "0.8.5"
|
||||
|
@ -37,4 +37,4 @@ sqlite = ["rusqlite", "rayon", "r2d2", "tokio", "is_sync"]
|
|||
foundation = ["foundationdb", "futures", "is_async"]
|
||||
is_sync = ["maybe-async/is_sync", "parking_lot", "lru-cache"]
|
||||
is_async = []
|
||||
|
||||
test_mode = []
|
||||
|
|
|
@ -22,9 +22,9 @@ use super::{
|
|||
SUBSPACE_VALUES,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
#[cfg(feature = "test_mode")]
|
||||
const ID_ASSIGNMENT_EXPIRY: u64 = 2; // seconds
|
||||
#[cfg(not(test))]
|
||||
#[cfg(not(feature = "test_mode"))]
|
||||
pub const ID_ASSIGNMENT_EXPIRY: u64 = 60 * 60; // seconds
|
||||
|
||||
const MAX_COMMIT_ATTEMPTS: u8 = 10;
|
||||
|
|
|
@ -4,14 +4,15 @@ use ahash::AHashSet;
|
|||
|
||||
use crate::{
|
||||
write::{BatchBuilder, IntoOperations, Operation},
|
||||
Serialize, BLOOM_BIGRAM, BLOOM_TRIGRAM, BM_HASH, HASH_EXACT, HASH_STEMMED,
|
||||
Serialize, BLOOM_BIGRAM, BLOOM_TRIGRAM, HASH_EXACT, HASH_STEMMED,
|
||||
};
|
||||
|
||||
use super::{
|
||||
bloom::{hash_token, BloomFilter},
|
||||
bloom::BloomFilter,
|
||||
lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
|
||||
ngram::ToNgrams,
|
||||
stemmer::Stemmer,
|
||||
tokenizers::space::SpaceTokenizer,
|
||||
Language,
|
||||
};
|
||||
|
||||
|
@ -26,6 +27,7 @@ struct Text<'x> {
|
|||
|
||||
pub struct FtsIndexBuilder<'x> {
|
||||
parts: Vec<Text<'x>>,
|
||||
tokens: AHashSet<(u8, String)>,
|
||||
detect: LanguageDetector,
|
||||
default_language: Language,
|
||||
}
|
||||
|
@ -35,6 +37,7 @@ impl<'x> FtsIndexBuilder<'x> {
|
|||
FtsIndexBuilder {
|
||||
parts: vec![],
|
||||
detect: LanguageDetector::new(),
|
||||
tokens: AHashSet::new(),
|
||||
default_language,
|
||||
}
|
||||
}
|
||||
|
@ -55,6 +58,13 @@ impl<'x> FtsIndexBuilder<'x> {
|
|||
language,
|
||||
});
|
||||
}
|
||||
|
||||
pub fn index_raw(&mut self, field: impl Into<u8>, text: &str) {
|
||||
let field = field.into();
|
||||
for token in SpaceTokenizer::new(text, MAX_TOKEN_LENGTH) {
|
||||
self.tokens.insert((field, token));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x> IntoOperations for FtsIndexBuilder<'x> {
|
||||
|
@ -82,12 +92,9 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
|
|||
}
|
||||
|
||||
for (word, family) in unique_words {
|
||||
batch.ops.push(Operation::Bitmap {
|
||||
family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
|
||||
field: part.field,
|
||||
key: hash_token(&word),
|
||||
set: true,
|
||||
});
|
||||
batch
|
||||
.ops
|
||||
.push(Operation::hash(&word, family, part.field, true));
|
||||
}
|
||||
|
||||
if phrase_words.len() > 1 {
|
||||
|
@ -106,6 +113,12 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
|
|||
}
|
||||
}
|
||||
|
||||
for (field, token) in self.tokens {
|
||||
batch
|
||||
.ops
|
||||
.push(Operation::hash(&token, HASH_EXACT, field, true));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
* for more details.
|
||||
*/
|
||||
|
||||
use crate::{BitmapKey, BM_HASH};
|
||||
use crate::{write::Operation, BitmapKey, BM_HASH};
|
||||
|
||||
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
|
||||
|
||||
|
@ -179,3 +179,14 @@ impl BitmapKey<Vec<u8>> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Operation {
|
||||
pub fn hash(word: &str, family: u8, field: u8, set: bool) -> Self {
|
||||
Operation::Bitmap {
|
||||
family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
|
||||
field,
|
||||
key: hash_token(word),
|
||||
set,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
pub mod chinese;
|
||||
pub mod indo_european;
|
||||
pub mod japanese;
|
||||
pub mod space;
|
||||
pub mod word;
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
|
51
crates/store/src/fts/tokenizers/space.rs
Normal file
51
crates/store/src/fts/tokenizers/space.rs
Normal file
|
@ -0,0 +1,51 @@
|
|||
use std::str::Chars;
|
||||
|
||||
pub struct SpaceTokenizer<'x> {
|
||||
iterator: Chars<'x>,
|
||||
token: String,
|
||||
max_token_length: usize,
|
||||
}
|
||||
|
||||
impl SpaceTokenizer<'_> {
|
||||
pub fn new(text: &str, max_token_length: usize) -> SpaceTokenizer {
|
||||
SpaceTokenizer {
|
||||
iterator: text.chars(),
|
||||
token: String::new(),
|
||||
max_token_length,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SpaceTokenizer<'_> {
|
||||
type Item = String;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
for ch in self.iterator.by_ref() {
|
||||
if ch.is_alphanumeric() {
|
||||
if ch.is_uppercase() {
|
||||
for ch in ch.to_lowercase() {
|
||||
self.token.push(ch);
|
||||
}
|
||||
} else {
|
||||
self.token.push(ch);
|
||||
}
|
||||
} else if !self.token.is_empty() {
|
||||
if self.token.len() < self.max_token_length {
|
||||
return Some(std::mem::take(&mut self.token));
|
||||
} else {
|
||||
self.token.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !self.token.is_empty() {
|
||||
if self.token.len() < self.max_token_length {
|
||||
return Some(std::mem::take(&mut self.token));
|
||||
} else {
|
||||
self.token.clear();
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
|
@ -1,8 +1,12 @@
|
|||
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
|
||||
|
||||
use ahash::HashSet;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{write::Tokenize, BitmapKey, ReadTransaction, Store, BM_KEYWORD};
|
||||
use crate::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
|
||||
BitmapKey, ReadTransaction, Store, BM_KEYWORD,
|
||||
};
|
||||
|
||||
use super::{Filter, ResultSet};
|
||||
|
||||
|
@ -53,17 +57,10 @@ impl ReadTransaction<'_> {
|
|||
}
|
||||
Filter::HasKeywords { field, value } => {
|
||||
self.get_bitmaps_intersection(
|
||||
value
|
||||
.tokenize()
|
||||
SpaceTokenizer::new(&value, MAX_TOKEN_LENGTH)
|
||||
.collect::<HashSet<String>>()
|
||||
.into_iter()
|
||||
.map(|key| BitmapKey {
|
||||
account_id,
|
||||
collection,
|
||||
family: BM_KEYWORD,
|
||||
field,
|
||||
key: key.into_bytes(),
|
||||
block_num: 0,
|
||||
})
|
||||
.map(|word| BitmapKey::hash(&word, account_id, collection, 0, field))
|
||||
.collect(),
|
||||
)
|
||||
.await?
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use crate::{BM_DOCUMENT_IDS, BM_KEYWORD};
|
||||
use crate::BM_DOCUMENT_IDS;
|
||||
|
||||
use super::{
|
||||
Batch, BatchBuilder, HasFlag, IntoBitmap, IntoOperations, Operation, Serialize, Tokenize,
|
||||
|
@ -68,14 +68,7 @@ impl BatchBuilder {
|
|||
let is_set = !options.has_flag(F_CLEAR);
|
||||
|
||||
if options.has_flag(F_TOKENIZE) {
|
||||
for token in value.tokenize() {
|
||||
self.ops.push(Operation::Bitmap {
|
||||
family: BM_KEYWORD,
|
||||
field,
|
||||
key: token.into_bytes(),
|
||||
set: is_set,
|
||||
});
|
||||
}
|
||||
value.tokenize(&mut self.ops, field, is_set);
|
||||
}
|
||||
|
||||
let value = value.serialize();
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
use std::{collections::HashSet, time::SystemTime};
|
||||
|
||||
use crate::{Deserialize, Serialize};
|
||||
use crate::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
|
||||
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, TAG_TEXT,
|
||||
};
|
||||
|
||||
pub mod batch;
|
||||
pub mod key;
|
||||
|
@ -130,55 +133,43 @@ impl HasFlag for u32 {
|
|||
}
|
||||
|
||||
pub trait Tokenize {
|
||||
fn tokenize(&self) -> HashSet<String>;
|
||||
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool);
|
||||
}
|
||||
|
||||
impl Tokenize for &str {
|
||||
fn tokenize(&self) -> HashSet<String> {
|
||||
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool) {
|
||||
let mut tokens = HashSet::new();
|
||||
let mut token = String::new();
|
||||
|
||||
for ch in self.chars() {
|
||||
if ch.is_alphanumeric() {
|
||||
if ch.is_uppercase() {
|
||||
token.push(ch.to_lowercase().next().unwrap());
|
||||
} else {
|
||||
token.push(ch);
|
||||
}
|
||||
} else if !token.is_empty() {
|
||||
tokens.insert(token);
|
||||
token = String::new();
|
||||
}
|
||||
}
|
||||
|
||||
if !token.is_empty() {
|
||||
for token in SpaceTokenizer::new(self, MAX_TOKEN_LENGTH) {
|
||||
tokens.insert(token);
|
||||
}
|
||||
|
||||
tokens
|
||||
for token in tokens {
|
||||
ops.push(Operation::hash(&token, HASH_EXACT, field, set));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenize for String {
|
||||
fn tokenize(&self) -> HashSet<String> {
|
||||
self.as_str().tokenize()
|
||||
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool) {
|
||||
self.as_str().tokenize(ops, field, set)
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenize for u32 {
|
||||
fn tokenize(&self) -> HashSet<String> {
|
||||
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenize for u64 {
|
||||
fn tokenize(&self) -> HashSet<String> {
|
||||
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Tokenize for f64 {
|
||||
fn tokenize(&self) -> HashSet<String> {
|
||||
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
@ -187,6 +178,30 @@ pub trait IntoBitmap {
|
|||
fn into_bitmap(self) -> (Vec<u8>, u8);
|
||||
}
|
||||
|
||||
impl IntoBitmap for () {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
(vec![], BM_TAG | TAG_STATIC)
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoBitmap for u32 {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
(self.serialize(), BM_TAG | TAG_ID)
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoBitmap for String {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
(self.serialize(), BM_TAG | TAG_TEXT)
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoBitmap for &str {
|
||||
fn into_bitmap(self) -> (Vec<u8>, u8) {
|
||||
(self.serialize(), BM_TAG | TAG_TEXT)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait IntoOperations {
|
||||
fn build(self, batch: &mut BatchBuilder) -> crate::Result<()>;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue