Message indexing.

This commit is contained in:
Mauro D 2023-04-05 16:05:21 +00:00
parent 9f63c86db9
commit 0778e446a6
21 changed files with 1115 additions and 64 deletions

View file

@ -4,6 +4,9 @@ version = "0.1.0"
edition = "2021"
[dependencies]
store = { path = "../store" }
protocol = { path = "../protocol" }
utils = { path = "/home/vagrant/code/utils" }
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "serde_support", "ludicrous_mode"] }
mail-builder = { git = "https://github.com/stalwartlabs/mail-builder", features = ["ludicrous_mode"] }
mail-send = { git = "https://github.com/stalwartlabs/mail-send" }

View file

@ -0,0 +1,74 @@
use mail_parser::{Addr, HeaderValue};
use protocol::{
object::Object,
types::{
property::{HeaderForm, Property},
value::Value,
},
};
pub trait IntoForm {
fn into_form(self, form: &HeaderForm) -> Value;
}
impl IntoForm for HeaderValue<'_> {
fn into_form(self, form: &HeaderForm) -> Value {
match (self, form) {
(HeaderValue::Text(text), HeaderForm::Raw | HeaderForm::Text) => text.into(),
(HeaderValue::TextList(texts), HeaderForm::Raw | HeaderForm::Text) => {
texts.join(", ").into()
}
(HeaderValue::Text(text), HeaderForm::MessageIds) => Value::List(vec![text.into()]),
(HeaderValue::TextList(texts), HeaderForm::MessageIds) => texts.into(),
(HeaderValue::DateTime(datetime), HeaderForm::Date) => datetime.into(),
(
HeaderValue::Address(Addr {
address: Some(addr),
..
}),
HeaderForm::URLs,
) if addr.contains(':') => Value::List(vec![addr.into()]),
(HeaderValue::AddressList(addrlist), HeaderForm::URLs) => Value::List(
addrlist
.into_iter()
.filter_map(|addr| match addr {
Addr {
address: Some(addr),
..
} if addr.contains(':') => Some(addr.into()),
_ => None,
})
.collect(),
),
(HeaderValue::Address(addr), HeaderForm::Addresses) => Value::List(vec![addr.into()]),
(HeaderValue::AddressList(addrlist), HeaderForm::Addresses) => addrlist.into(),
(HeaderValue::Group(group), HeaderForm::Addresses) => group.addresses.into(),
(HeaderValue::GroupList(grouplist), HeaderForm::Addresses) => Value::List(
grouplist
.into_iter()
.flat_map(|group| group.addresses)
.map(Into::into)
.collect(),
),
(HeaderValue::Address(addr), HeaderForm::GroupedAddresses) => {
Value::List(vec![Object::with_capacity(2)
.with_property(Property::Name, Value::Null)
.with_property(Property::Addresses, Value::List(vec![addr.into()]))
.into()])
}
(HeaderValue::AddressList(addrlist), HeaderForm::GroupedAddresses) => {
Value::List(vec![Object::with_capacity(2)
.with_property(Property::Name, Value::Null)
.with_property(Property::Addresses, addrlist)
.into()])
}
(HeaderValue::Group(group), HeaderForm::GroupedAddresses) => {
Value::List(vec![group.into()])
}
(HeaderValue::GroupList(grouplist), HeaderForm::GroupedAddresses) => grouplist.into(),
_ => Value::Null,
}
}
}

View file

@ -0,0 +1 @@

View file

@ -0,0 +1,534 @@
use std::borrow::Cow;
use mail_parser::{
decoders::html::html_to_text,
parsers::{fields::thread::thread_name, preview::preview_text},
Addr, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType, RfcHeader,
};
use protocol::{
object::Object,
types::{
date::UTCDate,
keyword::Keyword,
property::{HeaderForm, Property},
value::Value,
},
};
use store::{
fts::{builder::FtsIndexBuilder, Language},
write::{BatchBuilder, F_INDEX, F_TOKENIZE, F_VALUE},
};
use crate::email::headers::IntoForm;
pub const MAX_MESSAGE_PARTS: usize = 1000;
pub const MAX_ID_LENGTH: usize = 100;
pub const MAX_SORT_FIELD_LENGTH: usize = 255;
pub const MAX_STORED_FIELD_LENGTH: usize = 512;
pub const PREVIEW_LENGTH: usize = 256;
trait IndexMessage {
fn index_message(
&mut self,
message: Message,
keywords: Vec<Keyword>,
mailbox_ids: Vec<u32>,
received_at: u64,
default_language: Language,
) -> store::Result<()>;
}
/*
o id
o blobId
o threadId
o mailboxIds
o keywords
o receivedAt
*/
impl IndexMessage for BatchBuilder {
fn index_message(
&mut self,
message: Message,
keywords: Vec<Keyword>,
mailbox_ids: Vec<u32>,
received_at: u64,
default_language: Language,
) -> store::Result<()> {
let mut object = Object::with_capacity(10);
// Index keywords
self.value(
Property::Keywords,
Value::from(keywords),
F_VALUE | F_TOKENIZE,
);
// Index mailboxIds
self.value(
Property::MailboxIds,
Value::from(mailbox_ids),
F_VALUE | F_TOKENIZE,
);
// Index size
object.append(Property::Size, message.raw_message.len());
self.value(Property::Size, message.raw_message.len() as u32, F_INDEX);
// Index receivedAt
object.append(
Property::ReceivedAt,
Value::Date(UTCDate::from_timestamp(received_at as i64)),
);
self.value(Property::ReceivedAt, received_at, F_INDEX);
let mut fts = FtsIndexBuilder::with_default_language(default_language);
let mut seen_headers = [false; 40];
let mut language = Language::Unknown;
let mut has_attachments = false;
let preview_part_id = message
.text_body
.first()
.or_else(|| message.html_body.first())
.copied()
.unwrap_or(usize::MAX);
for (part_id, part) in message
.parts
.into_iter()
.take(MAX_MESSAGE_PARTS)
.enumerate()
{
let part_language = part.language().unwrap_or(language);
if part_id == 0 {
language = part_language;
for header in part.headers.into_iter().rev() {
if let HeaderName::Rfc(rfc_header) = header.name {
match rfc_header {
RfcHeader::MessageId
| RfcHeader::InReplyTo
| RfcHeader::References
| RfcHeader::ResentMessageId => {
match &header.value {
HeaderValue::Text(id) if id.len() < MAX_ID_LENGTH => {
self.value(Property::MessageId, id.as_ref(), F_INDEX);
}
HeaderValue::TextList(ids) => {
for id in ids {
if id.len() < MAX_ID_LENGTH {
self.value(
Property::MessageId,
id.as_ref(),
F_INDEX,
);
}
}
}
_ => (),
}
if matches!(rfc_header, RfcHeader::MessageId | RfcHeader::InReplyTo)
&& !seen_headers[rfc_header as usize]
{
object.append(
rfc_header.into(),
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::MessageIds),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::From
| RfcHeader::To
| RfcHeader::Cc
| RfcHeader::Bcc
| RfcHeader::ReplyTo
| RfcHeader::Sender => {
let seen_header = seen_headers[rfc_header as usize];
if matches!(
rfc_header,
RfcHeader::From
| RfcHeader::To
| RfcHeader::Cc
| RfcHeader::Bcc
) {
let mut sort_text =
String::with_capacity(MAX_SORT_FIELD_LENGTH);
let mut found_addr = seen_header;
let mut last_is_space = true;
header.value.visit_addresses(|value, is_addr| {
if !found_addr {
if !sort_text.is_empty() {
sort_text.push(' ');
last_is_space = true;
}
found_addr = is_addr;
'outer: for ch in value.chars() {
for ch in ch.to_lowercase() {
if sort_text.len() < MAX_SORT_FIELD_LENGTH {
let is_space = ch.is_whitespace();
if !is_space || !last_is_space {
sort_text.push(ch);
last_is_space = is_space;
}
} else {
found_addr = true;
break 'outer;
}
}
}
}
// Index an address name or email without stemming
fts.index_raw(rfc_header, value);
});
if !seen_header {
// Add address to inverted index
self.value(
rfc_header,
if !sort_text.is_empty() {
&sort_text
} else {
"!"
},
F_INDEX,
);
}
}
if !seen_header {
// Add address to object
object.append(
rfc_header.into(),
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::Addresses),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::Date => {
if !seen_headers[rfc_header as usize] {
if let HeaderValue::DateTime(datetime) = &header.value {
self.value(
Property::SentAt,
datetime.to_timestamp() as u64,
F_INDEX,
);
}
object.append(
Property::SentAt,
header.value.into_form(&HeaderForm::Date),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::Subject => {
// Index subject
let subject = match &header.value {
HeaderValue::Text(text) => text.clone(),
HeaderValue::TextList(list) if !list.is_empty() => {
list.first().unwrap().clone()
}
_ => "".into(),
};
if !seen_headers[rfc_header as usize] {
// Add to object
object.append(
Property::Subject,
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::Text),
);
// Index thread name
let thread_name = thread_name(&subject);
self.value(
Property::Subject,
if !thread_name.is_empty() {
thread_name.trim_text(MAX_SORT_FIELD_LENGTH)
} else {
"!"
},
F_INDEX,
);
seen_headers[rfc_header as usize] = true;
}
// Index subject for FTS
fts.index(Property::Subject, subject, language);
}
_ => (),
}
}
}
}
match part.body {
PartType::Text(text) => {
if part_id == preview_part_id {
object.append(
Property::Preview,
preview_text(text.clone(), PREVIEW_LENGTH),
);
}
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
{
fts.index(Property::TextBody, text, part_language);
} else {
fts.index(Property::Attachments, text, part_language);
has_attachments = true;
}
}
PartType::Html(html) => {
let text = html_to_text(&html);
if part_id == preview_part_id {
object.append(
Property::Preview,
preview_text(text.clone().into(), PREVIEW_LENGTH),
);
}
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
{
fts.index(Property::TextBody, text, part_language);
} else {
fts.index(Property::Attachments, text, part_language);
has_attachments = true;
}
}
PartType::Binary(_) if !has_attachments => {
has_attachments = true;
}
PartType::Message(mut nested_message) => {
let nested_message_language = nested_message
.root_part()
.language()
.unwrap_or(Language::Unknown);
if let Some(HeaderValue::Text(subject)) =
nested_message.remove_header_rfc(RfcHeader::Subject)
{
fts.index(
Property::Attachments,
subject.into_owned(),
nested_message_language,
);
}
for sub_part in nested_message.parts.into_iter().take(MAX_MESSAGE_PARTS) {
let language = sub_part.language().unwrap_or(nested_message_language);
match sub_part.body {
PartType::Text(text) => {
fts.index(Property::Attachments, text, language);
}
PartType::Html(html) => {
fts.index(Property::Attachments, html_to_text(&html), language);
}
_ => (),
}
}
if !has_attachments {
has_attachments = true;
}
}
_ => {}
}
}
// Store and index hasAttachment property
object.append(Property::HasAttachment, has_attachments);
if has_attachments {
self.bitmap(Property::HasAttachment, (), 0);
}
// Store properties
self.value(Property::BodyStructure, Value::from(object), F_VALUE);
// Store full text index
self.custom(fts)?;
Ok(())
}
}
trait GetContentLanguage {
fn language(&self) -> Option<Language>;
}
impl GetContentLanguage for MessagePart<'_> {
fn language(&self) -> Option<Language> {
self.headers.rfc(&RfcHeader::ContentLanguage).and_then(|v| {
Language::from_iso_639(match v {
HeaderValue::Text(v) => v.as_ref(),
HeaderValue::TextList(v) => v.first()?,
_ => {
return None;
}
})
.unwrap_or(Language::Unknown)
.into()
})
}
}
trait VisitAddresses {
fn visit_addresses(&self, visitor: impl FnMut(&str, bool));
}
impl VisitAddresses for HeaderValue<'_> {
fn visit_addresses(&self, mut visitor: impl FnMut(&str, bool)) {
match self {
HeaderValue::Address(addr) => {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
HeaderValue::AddressList(addr_list) => {
for addr in addr_list {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
HeaderValue::Group(group) => {
if let Some(name) = &group.name {
visitor(name.as_ref(), false);
}
for addr in &group.addresses {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
HeaderValue::GroupList(groups) => {
for group in groups {
if let Some(name) = &group.name {
visitor(name.as_ref(), false);
}
for addr in &group.addresses {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
}
_ => (),
}
}
}
trait TrimTextValue {
fn trim_text(self, length: usize) -> Self;
}
impl TrimTextValue for HeaderValue<'_> {
fn trim_text(self, length: usize) -> Self {
match self {
HeaderValue::Address(v) => HeaderValue::Address(v.trim_text(length)),
HeaderValue::AddressList(v) => HeaderValue::AddressList(v.trim_text(length)),
HeaderValue::Group(v) => HeaderValue::Group(v.trim_text(length)),
HeaderValue::GroupList(v) => HeaderValue::GroupList(v.trim_text(length)),
HeaderValue::Text(v) => HeaderValue::Text(v.trim_text(length)),
HeaderValue::TextList(v) => HeaderValue::TextList(v.trim_text(length)),
v => v,
}
}
}
impl TrimTextValue for Addr<'_> {
fn trim_text(self, length: usize) -> Self {
Self {
name: self.name.map(|v| v.trim_text(length)),
address: self.address.map(|v| v.trim_text(length)),
}
}
}
impl TrimTextValue for Group<'_> {
fn trim_text(self, length: usize) -> Self {
Self {
name: self.name.map(|v| v.trim_text(length)),
addresses: self.addresses.trim_text(length),
}
}
}
impl TrimTextValue for Cow<'_, str> {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
match self {
Cow::Borrowed(v) => v.trim_text(length).into(),
Cow::Owned(v) => v.trim_text(length).into(),
}
}
}
}
impl TrimTextValue for &str {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
let mut index = 0;
for (i, _) in self.char_indices() {
if i > length {
break;
}
index = i;
}
&self[..index]
}
}
}
impl TrimTextValue for String {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
let mut result = String::with_capacity(length);
for (i, c) in self.char_indices() {
if i > length {
break;
}
result.push(c);
}
result
}
}
}
impl<T: TrimTextValue> TrimTextValue for Vec<T> {
fn trim_text(self, length: usize) -> Self {
self.into_iter().map(|v| v.trim_text(length)).collect()
}
}

View file

@ -0,0 +1,3 @@
pub mod headers;
pub mod import;
pub mod index;

View file

@ -0,0 +1 @@
pub mod email;

View file

@ -1,10 +1,12 @@
[package]
name = "procotol"
name = "protocol"
version = "0.1.0"
edition = "2021"
[dependencies]
store = { path = "../store" }
utils = { path = "/home/vagrant/code/utils" }
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "serde_support", "ludicrous_mode"] }
fast-float = "0.2.0"
serde = { version = "1.0", features = ["derive"]}
ahash = { version = "0.8.0", features = ["serde"] }

View file

@ -5,6 +5,7 @@ pub mod parser;
pub mod request;
pub mod types;
/*
#[cfg(test)]
mod tests {
use std::{collections::BTreeMap, sync::Arc};
@ -107,3 +108,4 @@ mod tests {
}*/
}
}
*/

View file

@ -3,11 +3,95 @@ pub mod email_submission;
pub mod mailbox;
pub mod sieve;
use store::{
write::{IntoBitmap, Operation, Tokenize},
Serialize,
};
use utils::map::vec_map::VecMap;
use crate::types::property::Property;
use crate::types::{property::Property, value::Value};
#[derive(Debug, Clone, Default, serde::Serialize)]
#[derive(Debug, Clone, Default, serde::Serialize, PartialEq, Eq)]
pub struct Object<T> {
pub properties: VecMap<Property, T>,
}
impl Object<Value> {
pub fn with_capacity(capacity: usize) -> Self {
Self {
properties: VecMap::with_capacity(capacity),
}
}
pub fn set(&mut self, property: Property, value: impl Into<Value>) -> bool {
self.properties.set(property, value.into())
}
pub fn append(&mut self, property: Property, value: impl Into<Value>) {
self.properties.append(property, value.into());
}
pub fn with_property(mut self, property: Property, value: impl Into<Value>) -> Self {
self.properties.append(property, value.into());
self
}
}
impl Serialize for Value {
fn serialize(self) -> Vec<u8> {
todo!()
}
}
impl Tokenize for Value {
fn tokenize(&self, ops: &mut Vec<store::write::Operation>, field: u8, set: bool) {
match self {
Value::Text(text) => text.as_str().tokenize(ops, field, set),
Value::Keyword(keyword) => {
let (key, family) = keyword.into_bitmap();
ops.push(Operation::Bitmap {
family,
field,
key,
set,
});
}
Value::UnsignedInt(int) => {
let (key, family) = (*int as u32).into_bitmap();
ops.push(Operation::Bitmap {
family,
field,
key,
set,
});
}
Value::List(items) => {
for item in items {
match item {
Value::Text(text) => text.as_str().tokenize(ops, field, set),
Value::UnsignedInt(int) => {
let (key, family) = (*int as u32).into_bitmap();
ops.push(Operation::Bitmap {
family,
field,
key,
set,
});
}
Value::Keyword(keyword) => {
let (key, family) = keyword.into_bitmap();
ops.push(Operation::Bitmap {
family,
field,
key,
set,
})
}
_ => (),
}
}
}
_ => (),
}
}
}

View file

@ -1,5 +1,7 @@
use std::fmt::Display;
use store::{write::IntoBitmap, Serialize, BM_TAG, TAG_STATIC, TAG_TEXT};
use crate::parser::{json::Parser, JsonObjectParser};
pub const SEEN: u8 = 0;
@ -114,3 +116,43 @@ impl Display for Keyword {
}
}
}
impl IntoBitmap for Keyword {
fn into_bitmap(self) -> (Vec<u8>, u8) {
match self {
Keyword::Seen => (vec![0u8], BM_TAG | TAG_STATIC),
Keyword::Draft => (vec![1u8], BM_TAG | TAG_STATIC),
Keyword::Flagged => (vec![2u8], BM_TAG | TAG_STATIC),
Keyword::Answered => (vec![3u8], BM_TAG | TAG_STATIC),
Keyword::Recent => (vec![4u8], BM_TAG | TAG_STATIC),
Keyword::Important => (vec![5u8], BM_TAG | TAG_STATIC),
Keyword::Phishing => (vec![6u8], BM_TAG | TAG_STATIC),
Keyword::Junk => (vec![7u8], BM_TAG | TAG_STATIC),
Keyword::NotJunk => (vec![8u8], BM_TAG | TAG_STATIC),
Keyword::Deleted => (vec![9u8], BM_TAG | TAG_STATIC),
Keyword::Forwarded => (vec![10u8], BM_TAG | TAG_STATIC),
Keyword::MdnSent => (vec![11u8], BM_TAG | TAG_STATIC),
Keyword::Other(string) => (string.serialize(), BM_TAG | TAG_TEXT),
}
}
}
impl IntoBitmap for &Keyword {
fn into_bitmap(self) -> (Vec<u8>, u8) {
match self {
Keyword::Seen => (vec![0u8], BM_TAG | TAG_STATIC),
Keyword::Draft => (vec![1u8], BM_TAG | TAG_STATIC),
Keyword::Flagged => (vec![2u8], BM_TAG | TAG_STATIC),
Keyword::Answered => (vec![3u8], BM_TAG | TAG_STATIC),
Keyword::Recent => (vec![4u8], BM_TAG | TAG_STATIC),
Keyword::Important => (vec![5u8], BM_TAG | TAG_STATIC),
Keyword::Phishing => (vec![6u8], BM_TAG | TAG_STATIC),
Keyword::Junk => (vec![7u8], BM_TAG | TAG_STATIC),
Keyword::NotJunk => (vec![8u8], BM_TAG | TAG_STATIC),
Keyword::Deleted => (vec![9u8], BM_TAG | TAG_STATIC),
Keyword::Forwarded => (vec![10u8], BM_TAG | TAG_STATIC),
Keyword::MdnSent => (vec![11u8], BM_TAG | TAG_STATIC),
Keyword::Other(string) => (string.as_str().serialize(), BM_TAG | TAG_TEXT),
}
}
}

View file

@ -1,5 +1,6 @@
use std::fmt::{Display, Formatter};
use mail_parser::RfcHeader;
use serde::Serialize;
use crate::parser::{json::Parser, Error, JsonObjectParser};
@ -805,3 +806,126 @@ impl Display for HeaderForm {
}
}
}
impl From<Property> for u8 {
fn from(value: Property) -> Self {
match value {
Property::IsActive => 0,
Property::IsEnabled => 1,
Property::IsSubscribed => 2,
Property::Keys => 3,
Property::Keywords => 4,
Property::Language => 5,
Property::Location => 6,
Property::MailboxIds => 7,
Property::MayDelete => 8,
Property::MdnBlobIds => 9,
Property::Members => 10,
Property::MessageId => 11,
Property::MyRights => 12,
Property::Name => 13,
Property::ParentId => 14,
Property::PartId => 15,
Property::Picture => 16,
Property::Preview => 17,
Property::Quota => 18,
Property::ReceivedAt => 19,
Property::References => 20,
Property::ReplyTo => 21,
Property::Role => 22,
Property::Secret => 23,
Property::SendAt => 24,
Property::Sender => 25,
Property::SentAt => 26,
Property::Size => 27,
Property::SortOrder => 28,
Property::Subject => 29,
Property::SubParts => 30,
Property::TextBody => 31,
Property::TextSignature => 32,
Property::ThreadId => 33,
Property::Timezone => 34,
Property::To => 35,
Property::ToDate => 36,
Property::TotalEmails => 37,
Property::TotalThreads => 38,
Property::Type => 39,
Property::Types => 40,
Property::UndoStatus => 41,
Property::UnreadEmails => 42,
Property::UnreadThreads => 43,
Property::Url => 44,
Property::VerificationCode => 45,
Property::Parameters => 46,
Property::Addresses => 47,
Property::P256dh => 48,
Property::Auth => 49,
Property::Value => 50,
Property::SmtpReply => 51,
Property::Delivered => 52,
Property::Displayed => 53,
Property::MailFrom => 54,
Property::RcptTo => 55,
Property::IsEncodingProblem => 56,
Property::IsTruncated => 57,
Property::MayReadItems => 58,
Property::MayAddItems => 59,
Property::MayRemoveItems => 60,
Property::MaySetSeen => 61,
Property::MaySetKeywords => 62,
Property::MayCreateChild => 63,
Property::MayRename => 64,
Property::MaySubmit => 65,
Property::Acl => 66,
Property::Aliases => 67,
Property::Attachments => 68,
Property::Bcc => 69,
Property::BlobId => 70,
Property::BodyStructure => 71,
Property::BodyValues => 72,
Property::Capabilities => 73,
Property::Cc => 74,
Property::Charset => 75,
Property::Cid => 76,
Property::DeliveryStatus => 77,
Property::Description => 78,
Property::DeviceClientId => 79,
Property::Disposition => 80,
Property::DsnBlobIds => 81,
Property::Email => 82,
Property::EmailId => 83,
Property::EmailIds => 84,
Property::Envelope => 85,
Property::Expires => 86,
Property::From => 87,
Property::FromDate => 88,
Property::HasAttachment => 89,
Property::Header(_) => 90,
Property::Headers => 91,
Property::HtmlBody => 92,
Property::HtmlSignature => 93,
Property::Id => 94,
Property::IdentityId => 95,
Property::InReplyTo => 96,
Property::_T(_) => 97,
}
}
}
impl From<RfcHeader> for Property {
fn from(value: RfcHeader) -> Self {
match value {
RfcHeader::Subject => Property::Subject,
RfcHeader::From => Property::From,
RfcHeader::To => Property::To,
RfcHeader::Cc => Property::Cc,
RfcHeader::Date => Property::SentAt,
RfcHeader::Bcc => Property::Bcc,
RfcHeader::ReplyTo => Property::ReplyTo,
RfcHeader::Sender => Property::Sender,
RfcHeader::InReplyTo => Property::InReplyTo,
RfcHeader::MessageId => Property::MessageId,
_ => unreachable!(),
}
}
}

View file

@ -1,9 +1,10 @@
use std::fmt::Display;
use std::{borrow::Cow, fmt::Display};
use mail_parser::{Addr, DateTime, Group};
use serde::Serialize;
use utils::map::vec_map::VecMap;
use crate::{
object::Object,
parser::{json::Parser, Ignore, JsonObjectParser, Token},
request::reference::ResultReference,
};
@ -30,7 +31,7 @@ pub enum Value {
TypeState(TypeState),
Acl(Acl),
List(Vec<Value>),
Object(VecMap<Property, Value>),
Object(Object<Value>),
Null,
}
@ -57,7 +58,7 @@ impl Value {
Ok(match parser.next_token::<V>()? {
Token::String(v) => v.into_value(),
Token::DictStart => {
let mut properties = VecMap::with_capacity(4);
let mut properties = Object::with_capacity(4);
while {
let property = parser.next_dict_key::<K>()?.into_property();
let value = Value::from_property(parser, &property)?;
@ -204,3 +205,102 @@ impl IntoValue for TypeState {
Value::TypeState(self)
}
}
impl From<usize> for Value {
fn from(value: usize) -> Self {
Value::UnsignedInt(value as u64)
}
}
impl From<u64> for Value {
fn from(value: u64) -> Self {
Value::UnsignedInt(value)
}
}
impl From<u32> for Value {
fn from(value: u32) -> Self {
Value::UnsignedInt(value as u64)
}
}
impl From<String> for Value {
fn from(value: String) -> Self {
Value::Text(value)
}
}
impl From<bool> for Value {
fn from(value: bool) -> Self {
Value::Bool(value)
}
}
impl From<Keyword> for Value {
fn from(value: Keyword) -> Self {
Value::Keyword(value)
}
}
impl From<Object<Value>> for Value {
fn from(value: Object<Value>) -> Self {
Value::Object(value)
}
}
impl From<DateTime> for Value {
fn from(date: DateTime) -> Self {
Value::Date(UTCDate {
year: date.year,
month: date.month,
day: date.day,
hour: date.hour,
minute: date.minute,
second: date.second,
tz_before_gmt: date.tz_before_gmt,
tz_hour: date.tz_hour,
tz_minute: date.tz_minute,
})
}
}
impl From<Cow<'_, str>> for Value {
fn from(value: Cow<'_, str>) -> Self {
Value::Text(value.into_owned())
}
}
impl<T: Into<Value>> From<Vec<T>> for Value {
fn from(value: Vec<T>) -> Self {
Value::List(value.into_iter().map(|v| v.into()).collect())
}
}
impl<T: Into<Value>> From<Option<T>> for Value {
fn from(value: Option<T>) -> Self {
match value {
Some(value) => value.into(),
None => Value::Null,
}
}
}
impl From<Addr<'_>> for Value {
fn from(value: Addr<'_>) -> Self {
Value::Object(
Object::with_capacity(2)
.with_property(Property::Name, value.name)
.with_property(Property::Email, value.address.unwrap_or_default()),
)
}
}
impl From<Group<'_>> for Value {
fn from(group: Group<'_>) -> Self {
Value::Object(
Object::with_capacity(2)
.with_property(Property::Name, group.name)
.with_property(Property::Addresses, group.addresses),
)
}
}

View file

@ -8,7 +8,7 @@ utils = { path = "/home/vagrant/code/utils" }
rocksdb = { version = "0.20.1", optional = true }
foundationdb = { version = "0.7.0", optional = true }
rusqlite = { version = "0.29.0", features = ["bundled"], optional = true }
tokio = { version = "1.23", features = ["sync"], optional = true }
tokio = { version = "1.23", features = ["sync", "fs", "io-util"], optional = true }
r2d2 = { version = "0.8.10", optional = true }
futures = { version = "0.3", optional = true }
rand = "0.8.5"
@ -37,4 +37,4 @@ sqlite = ["rusqlite", "rayon", "r2d2", "tokio", "is_sync"]
foundation = ["foundationdb", "futures", "is_async"]
is_sync = ["maybe-async/is_sync", "parking_lot", "lru-cache"]
is_async = []
test_mode = []

View file

@ -22,9 +22,9 @@ use super::{
SUBSPACE_VALUES,
};
#[cfg(test)]
#[cfg(feature = "test_mode")]
const ID_ASSIGNMENT_EXPIRY: u64 = 2; // seconds
#[cfg(not(test))]
#[cfg(not(feature = "test_mode"))]
pub const ID_ASSIGNMENT_EXPIRY: u64 = 60 * 60; // seconds
const MAX_COMMIT_ATTEMPTS: u8 = 10;

View file

@ -4,14 +4,15 @@ use ahash::AHashSet;
use crate::{
write::{BatchBuilder, IntoOperations, Operation},
Serialize, BLOOM_BIGRAM, BLOOM_TRIGRAM, BM_HASH, HASH_EXACT, HASH_STEMMED,
Serialize, BLOOM_BIGRAM, BLOOM_TRIGRAM, HASH_EXACT, HASH_STEMMED,
};
use super::{
bloom::{hash_token, BloomFilter},
bloom::BloomFilter,
lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
ngram::ToNgrams,
stemmer::Stemmer,
tokenizers::space::SpaceTokenizer,
Language,
};
@ -26,6 +27,7 @@ struct Text<'x> {
pub struct FtsIndexBuilder<'x> {
parts: Vec<Text<'x>>,
tokens: AHashSet<(u8, String)>,
detect: LanguageDetector,
default_language: Language,
}
@ -35,6 +37,7 @@ impl<'x> FtsIndexBuilder<'x> {
FtsIndexBuilder {
parts: vec![],
detect: LanguageDetector::new(),
tokens: AHashSet::new(),
default_language,
}
}
@ -55,6 +58,13 @@ impl<'x> FtsIndexBuilder<'x> {
language,
});
}
pub fn index_raw(&mut self, field: impl Into<u8>, text: &str) {
let field = field.into();
for token in SpaceTokenizer::new(text, MAX_TOKEN_LENGTH) {
self.tokens.insert((field, token));
}
}
}
impl<'x> IntoOperations for FtsIndexBuilder<'x> {
@ -82,12 +92,9 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
}
for (word, family) in unique_words {
batch.ops.push(Operation::Bitmap {
family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
field: part.field,
key: hash_token(&word),
set: true,
});
batch
.ops
.push(Operation::hash(&word, family, part.field, true));
}
if phrase_words.len() > 1 {
@ -106,6 +113,12 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
}
}
for (field, token) in self.tokens {
batch
.ops
.push(Operation::hash(&token, HASH_EXACT, field, true));
}
Ok(())
}
}

View file

@ -21,7 +21,7 @@
* for more details.
*/
use crate::{BitmapKey, BM_HASH};
use crate::{write::Operation, BitmapKey, BM_HASH};
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
@ -179,3 +179,14 @@ impl BitmapKey<Vec<u8>> {
}
}
}
impl Operation {
pub fn hash(word: &str, family: u8, field: u8, set: bool) -> Self {
Operation::Bitmap {
family: BM_HASH | family | (word.len() & MAX_TOKEN_MASK) as u8,
field,
key: hash_token(word),
set,
}
}
}

View file

@ -24,6 +24,7 @@
pub mod chinese;
pub mod indo_european;
pub mod japanese;
pub mod space;
pub mod word;
use std::borrow::Cow;

View file

@ -0,0 +1,51 @@
use std::str::Chars;
pub struct SpaceTokenizer<'x> {
iterator: Chars<'x>,
token: String,
max_token_length: usize,
}
impl SpaceTokenizer<'_> {
pub fn new(text: &str, max_token_length: usize) -> SpaceTokenizer {
SpaceTokenizer {
iterator: text.chars(),
token: String::new(),
max_token_length,
}
}
}
impl Iterator for SpaceTokenizer<'_> {
type Item = String;
fn next(&mut self) -> Option<Self::Item> {
for ch in self.iterator.by_ref() {
if ch.is_alphanumeric() {
if ch.is_uppercase() {
for ch in ch.to_lowercase() {
self.token.push(ch);
}
} else {
self.token.push(ch);
}
} else if !self.token.is_empty() {
if self.token.len() < self.max_token_length {
return Some(std::mem::take(&mut self.token));
} else {
self.token.clear();
}
}
}
if !self.token.is_empty() {
if self.token.len() < self.max_token_length {
return Some(std::mem::take(&mut self.token));
} else {
self.token.clear();
}
}
None
}
}

View file

@ -1,8 +1,12 @@
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
use ahash::HashSet;
use roaring::RoaringBitmap;
use crate::{write::Tokenize, BitmapKey, ReadTransaction, Store, BM_KEYWORD};
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
BitmapKey, ReadTransaction, Store, BM_KEYWORD,
};
use super::{Filter, ResultSet};
@ -53,17 +57,10 @@ impl ReadTransaction<'_> {
}
Filter::HasKeywords { field, value } => {
self.get_bitmaps_intersection(
value
.tokenize()
SpaceTokenizer::new(&value, MAX_TOKEN_LENGTH)
.collect::<HashSet<String>>()
.into_iter()
.map(|key| BitmapKey {
account_id,
collection,
family: BM_KEYWORD,
field,
key: key.into_bytes(),
block_num: 0,
})
.map(|word| BitmapKey::hash(&word, account_id, collection, 0, field))
.collect(),
)
.await?

View file

@ -1,4 +1,4 @@
use crate::{BM_DOCUMENT_IDS, BM_KEYWORD};
use crate::BM_DOCUMENT_IDS;
use super::{
Batch, BatchBuilder, HasFlag, IntoBitmap, IntoOperations, Operation, Serialize, Tokenize,
@ -68,14 +68,7 @@ impl BatchBuilder {
let is_set = !options.has_flag(F_CLEAR);
if options.has_flag(F_TOKENIZE) {
for token in value.tokenize() {
self.ops.push(Operation::Bitmap {
family: BM_KEYWORD,
field,
key: token.into_bytes(),
set: is_set,
});
}
value.tokenize(&mut self.ops, field, is_set);
}
let value = value.serialize();

View file

@ -1,6 +1,9 @@
use std::{collections::HashSet, time::SystemTime};
use crate::{Deserialize, Serialize};
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, TAG_TEXT,
};
pub mod batch;
pub mod key;
@ -130,55 +133,43 @@ impl HasFlag for u32 {
}
pub trait Tokenize {
fn tokenize(&self) -> HashSet<String>;
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool);
}
impl Tokenize for &str {
fn tokenize(&self) -> HashSet<String> {
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool) {
let mut tokens = HashSet::new();
let mut token = String::new();
for ch in self.chars() {
if ch.is_alphanumeric() {
if ch.is_uppercase() {
token.push(ch.to_lowercase().next().unwrap());
} else {
token.push(ch);
}
} else if !token.is_empty() {
tokens.insert(token);
token = String::new();
}
}
if !token.is_empty() {
for token in SpaceTokenizer::new(self, MAX_TOKEN_LENGTH) {
tokens.insert(token);
}
tokens
for token in tokens {
ops.push(Operation::hash(&token, HASH_EXACT, field, set));
}
}
}
impl Tokenize for String {
fn tokenize(&self) -> HashSet<String> {
self.as_str().tokenize()
fn tokenize(&self, ops: &mut Vec<Operation>, field: u8, set: bool) {
self.as_str().tokenize(ops, field, set)
}
}
impl Tokenize for u32 {
fn tokenize(&self) -> HashSet<String> {
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
unreachable!()
}
}
impl Tokenize for u64 {
fn tokenize(&self) -> HashSet<String> {
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
unreachable!()
}
}
impl Tokenize for f64 {
fn tokenize(&self) -> HashSet<String> {
fn tokenize(&self, _ops: &mut Vec<Operation>, _field: u8, _set: bool) {
unreachable!()
}
}
@ -187,6 +178,30 @@ pub trait IntoBitmap {
fn into_bitmap(self) -> (Vec<u8>, u8);
}
impl IntoBitmap for () {
fn into_bitmap(self) -> (Vec<u8>, u8) {
(vec![], BM_TAG | TAG_STATIC)
}
}
impl IntoBitmap for u32 {
fn into_bitmap(self) -> (Vec<u8>, u8) {
(self.serialize(), BM_TAG | TAG_ID)
}
}
impl IntoBitmap for String {
fn into_bitmap(self) -> (Vec<u8>, u8) {
(self.serialize(), BM_TAG | TAG_TEXT)
}
}
impl IntoBitmap for &str {
fn into_bitmap(self) -> (Vec<u8>, u8) {
(self.serialize(), BM_TAG | TAG_TEXT)
}
}
pub trait IntoOperations {
fn build(self, batch: &mut BatchBuilder) -> crate::Result<()>;
}