mail-server/crates/jmap/src/email/index.rs
2023-04-21 15:40:11 +00:00

553 lines
22 KiB
Rust

use std::borrow::Cow;
use jmap_proto::{
object::Object,
types::{
date::UTCDate,
keyword::Keyword,
property::{HeaderForm, Property},
value::Value,
},
};
use mail_parser::{
decoders::html::html_to_text,
parsers::{fields::thread::thread_name, preview::preview_text},
Addr, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType, RfcHeader,
};
use store::{
fts::{
builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
Language,
},
write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE},
};
use crate::email::headers::IntoForm;
pub const MAX_MESSAGE_PARTS: usize = 1000;
pub const MAX_ID_LENGTH: usize = 100;
pub const MAX_SORT_FIELD_LENGTH: usize = 255;
pub const MAX_STORED_FIELD_LENGTH: usize = 512;
pub const PREVIEW_LENGTH: usize = 256;
pub(super) trait IndexMessage {
fn index_message(
&mut self,
message: Message,
keywords: Vec<Keyword>,
mailbox_ids: Vec<u32>,
received_at: u64,
default_language: Language,
) -> store::Result<()>;
}
impl IndexMessage for BatchBuilder {
fn index_message(
&mut self,
message: Message,
keywords: Vec<Keyword>,
mailbox_ids: Vec<u32>,
received_at: u64,
default_language: Language,
) -> store::Result<()> {
let mut object = Object::with_capacity(15);
// Index keywords
self.value(Property::Keywords, keywords, F_VALUE | F_BITMAP);
// Index mailboxIds
self.value(Property::MailboxIds, mailbox_ids, F_VALUE | F_BITMAP);
// Index size
object.append(Property::Size, message.raw_message.len());
self.value(Property::Size, message.raw_message.len() as u32, F_INDEX);
// Index receivedAt
object.append(
Property::ReceivedAt,
Value::Date(UTCDate::from_timestamp(received_at as i64)),
);
self.value(Property::ReceivedAt, received_at, F_INDEX);
let mut fts = FtsIndexBuilder::with_default_language(default_language);
let mut seen_headers = [false; 40];
let mut language = Language::Unknown;
let mut has_attachments = false;
let preview_part_id = message
.text_body
.first()
.or_else(|| message.html_body.first())
.copied()
.unwrap_or(usize::MAX);
for (part_id, part) in message
.parts
.into_iter()
.take(MAX_MESSAGE_PARTS)
.enumerate()
{
let part_language = part.language().unwrap_or(language);
if part_id == 0 {
language = part_language;
for header in part.headers.into_iter().rev() {
if let HeaderName::Rfc(rfc_header) = header.name {
// Index hasHeader property
let header_num = (rfc_header as u8).to_string();
fts.index_raw_token(Property::Headers, &header_num);
match rfc_header {
RfcHeader::MessageId
| RfcHeader::InReplyTo
| RfcHeader::References
| RfcHeader::ResentMessageId => {
header.value.visit_text(|id| {
// Add ids to inverted index
if id.len() < MAX_ID_LENGTH {
println!("indexing {}: {}", rfc_header.as_str(), id);
self.value(Property::MessageId, id, F_INDEX);
}
// Index ids without stemming
if id.len() < MAX_TOKEN_LENGTH {
fts.index_raw_token(
Property::Headers,
format!("{header_num}{id}"),
);
}
});
if matches!(
rfc_header,
RfcHeader::MessageId
| RfcHeader::InReplyTo
| RfcHeader::References
) && !seen_headers[rfc_header as usize]
{
object.append(
rfc_header.into(),
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::MessageIds),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::From
| RfcHeader::To
| RfcHeader::Cc
| RfcHeader::Bcc
| RfcHeader::ReplyTo
| RfcHeader::Sender => {
let property = Property::from(rfc_header);
let seen_header = seen_headers[rfc_header as usize];
if matches!(
rfc_header,
RfcHeader::From
| RfcHeader::To
| RfcHeader::Cc
| RfcHeader::Bcc
) {
let mut sort_text =
String::with_capacity(MAX_SORT_FIELD_LENGTH);
let mut found_addr = seen_header;
let mut last_is_space = true;
header.value.visit_addresses(|value, is_addr| {
if !found_addr {
if !sort_text.is_empty() {
sort_text.push(' ');
last_is_space = true;
}
found_addr = is_addr;
'outer: for ch in value.chars() {
for ch in ch.to_lowercase() {
if sort_text.len() < MAX_SORT_FIELD_LENGTH {
let is_space = ch.is_whitespace();
if !is_space || !last_is_space {
sort_text.push(ch);
last_is_space = is_space;
}
} else {
found_addr = true;
break 'outer;
}
}
}
}
// Index an address name or email without stemming
fts.index_raw(u8::from(&property), value);
});
if !seen_header {
// Add address to inverted index
self.value(
u8::from(&property),
if !sort_text.is_empty() {
&sort_text
} else {
"!"
},
F_INDEX,
);
}
}
if !seen_header {
// Add address to object
object.append(
property,
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::Addresses),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::Date => {
if !seen_headers[rfc_header as usize] {
if let HeaderValue::DateTime(datetime) = &header.value {
self.value(
Property::SentAt,
datetime.to_timestamp() as u64,
F_INDEX,
);
}
object.append(
Property::SentAt,
header.value.into_form(&HeaderForm::Date),
);
seen_headers[rfc_header as usize] = true;
}
}
RfcHeader::Subject => {
// Index subject
let subject = match &header.value {
HeaderValue::Text(text) => text.clone(),
HeaderValue::TextList(list) if !list.is_empty() => {
list.first().unwrap().clone()
}
_ => "".into(),
};
if !seen_headers[rfc_header as usize] {
// Add to object
object.append(
Property::Subject,
header
.value
.trim_text(MAX_STORED_FIELD_LENGTH)
.into_form(&HeaderForm::Text),
);
// Index thread name
let thread_name = thread_name(&subject);
self.value(
Property::Subject,
if !thread_name.is_empty() {
thread_name.trim_text(MAX_SORT_FIELD_LENGTH)
} else {
"!"
},
F_INDEX,
);
seen_headers[rfc_header as usize] = true;
}
// Index subject for FTS
fts.index(Property::Subject, subject, language);
}
RfcHeader::Comments | RfcHeader::Keywords | RfcHeader::ListId => {
// Index headers
header.value.visit_text(|text| {
for token in text.split_ascii_whitespace() {
if token.len() < MAX_TOKEN_LENGTH {
fts.index_raw_token(
Property::Headers,
format!("{header_num}{}", token.to_lowercase()),
);
}
}
});
}
_ => (),
}
}
}
}
match part.body {
PartType::Text(text) => {
if part_id == preview_part_id {
object.append(
Property::Preview,
preview_text(text.clone(), PREVIEW_LENGTH),
);
}
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
{
fts.index(Property::TextBody, text, part_language);
} else {
fts.index(Property::Attachments, text, part_language);
has_attachments = true;
}
}
PartType::Html(html) => {
let text = html_to_text(&html);
if part_id == preview_part_id {
object.append(
Property::Preview,
preview_text(text.clone().into(), PREVIEW_LENGTH),
);
}
if message.text_body.contains(&part_id) || message.html_body.contains(&part_id)
{
fts.index(Property::TextBody, text, part_language);
} else {
fts.index(Property::Attachments, text, part_language);
has_attachments = true;
}
}
PartType::Binary(_) if !has_attachments => {
has_attachments = true;
}
PartType::Message(mut nested_message) => {
let nested_message_language = nested_message
.root_part()
.language()
.unwrap_or(Language::Unknown);
if let Some(HeaderValue::Text(subject)) =
nested_message.remove_header_rfc(RfcHeader::Subject)
{
fts.index(
Property::Attachments,
subject.into_owned(),
nested_message_language,
);
}
for sub_part in nested_message.parts.into_iter().take(MAX_MESSAGE_PARTS) {
let language = sub_part.language().unwrap_or(nested_message_language);
match sub_part.body {
PartType::Text(text) => {
fts.index(Property::Attachments, text, language);
}
PartType::Html(html) => {
fts.index(Property::Attachments, html_to_text(&html), language);
}
_ => (),
}
}
if !has_attachments {
has_attachments = true;
}
}
_ => {}
}
}
// Store and index hasAttachment property
object.append(Property::HasAttachment, has_attachments);
if has_attachments {
self.bitmap(Property::HasAttachment, (), 0);
}
// Store properties
self.value(Property::BodyStructure, object, F_VALUE);
// Store full text index
self.custom(fts)?;
Ok(())
}
}
trait GetContentLanguage {
fn language(&self) -> Option<Language>;
}
impl GetContentLanguage for MessagePart<'_> {
fn language(&self) -> Option<Language> {
self.headers.rfc(&RfcHeader::ContentLanguage).and_then(|v| {
Language::from_iso_639(match v {
HeaderValue::Text(v) => v.as_ref(),
HeaderValue::TextList(v) => v.first()?,
_ => {
return None;
}
})
.unwrap_or(Language::Unknown)
.into()
})
}
}
trait VisitValues {
fn visit_addresses(&self, visitor: impl FnMut(&str, bool));
fn visit_text(&self, visitor: impl FnMut(&str));
}
impl VisitValues for HeaderValue<'_> {
fn visit_addresses(&self, mut visitor: impl FnMut(&str, bool)) {
match self {
HeaderValue::Address(addr) => {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
HeaderValue::AddressList(addr_list) => {
for addr in addr_list {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
HeaderValue::Group(group) => {
if let Some(name) = &group.name {
visitor(name.as_ref(), false);
}
for addr in &group.addresses {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
HeaderValue::GroupList(groups) => {
for group in groups {
if let Some(name) = &group.name {
visitor(name.as_ref(), false);
}
for addr in &group.addresses {
if let Some(name) = &addr.name {
visitor(name.as_ref(), false);
}
if let Some(addr) = &addr.address {
visitor(addr.as_ref(), true);
}
}
}
}
_ => (),
}
}
fn visit_text(&self, mut visitor: impl FnMut(&str)) {
match &self {
HeaderValue::Text(text) => {
visitor(text.as_ref());
}
HeaderValue::TextList(texts) => {
for text in texts {
visitor(text.as_ref());
}
}
_ => (),
}
}
}
pub trait TrimTextValue {
fn trim_text(self, length: usize) -> Self;
}
impl TrimTextValue for HeaderValue<'_> {
fn trim_text(self, length: usize) -> Self {
match self {
HeaderValue::Address(v) => HeaderValue::Address(v.trim_text(length)),
HeaderValue::AddressList(v) => HeaderValue::AddressList(v.trim_text(length)),
HeaderValue::Group(v) => HeaderValue::Group(v.trim_text(length)),
HeaderValue::GroupList(v) => HeaderValue::GroupList(v.trim_text(length)),
HeaderValue::Text(v) => HeaderValue::Text(v.trim_text(length)),
HeaderValue::TextList(v) => HeaderValue::TextList(v.trim_text(length)),
v => v,
}
}
}
impl TrimTextValue for Addr<'_> {
fn trim_text(self, length: usize) -> Self {
Self {
name: self.name.map(|v| v.trim_text(length)),
address: self.address.map(|v| v.trim_text(length)),
}
}
}
impl TrimTextValue for Group<'_> {
fn trim_text(self, length: usize) -> Self {
Self {
name: self.name.map(|v| v.trim_text(length)),
addresses: self.addresses.trim_text(length),
}
}
}
impl TrimTextValue for Cow<'_, str> {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
match self {
Cow::Borrowed(v) => v.trim_text(length).into(),
Cow::Owned(v) => v.trim_text(length).into(),
}
}
}
}
impl TrimTextValue for &str {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
let mut index = 0;
for (i, _) in self.char_indices() {
if i > length {
break;
}
index = i;
}
&self[..index]
}
}
}
impl TrimTextValue for String {
fn trim_text(self, length: usize) -> Self {
if self.len() < length {
self
} else {
let mut result = String::with_capacity(length);
for (i, c) in self.char_indices() {
if i > length {
break;
}
result.push(c);
}
result
}
}
}
impl<T: TrimTextValue> TrimTextValue for Vec<T> {
fn trim_text(self, length: usize) -> Self {
self.into_iter().map(|v| v.trim_text(length)).collect()
}
}