Bayes classifier

mdecimus 2023-10-11 19:21:11 +02:00
parent 3d9efd363a
commit ace58f74eb
41 changed files with 6737 additions and 934 deletions

Cargo.lock (generated)

@@ -93,9 +93,9 @@ dependencies = [
[[package]]
name = "aho-corasick"
version = "1.1.1"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
@@ -2676,15 +2676,6 @@ version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linkify"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
dependencies = [
"memchr",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.10"
@@ -2994,11 +2985,16 @@ dependencies = [
"farmhash",
"jieba-rs",
"lazy_static",
"lru-cache",
"nohash",
"parking_lot",
"phf",
"rust-stemmers",
"serde",
"siphasher 1.0.0",
"tinysegmenter",
"tokio",
"utils",
"whatlang",
"xxhash-rust",
]
@@ -3294,9 +3290,9 @@ dependencies = [
[[package]]
name = "ordered-float"
version = "3.9.1"
version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06"
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
@@ -3630,9 +3626,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.68"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
dependencies = [
"unicode-ident",
]
@@ -3799,9 +3795,9 @@ dependencies = [
[[package]]
name = "rasn"
version = "0.10.1"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2"
checksum = "c22b7f7ff0508dae62e1be69fe02f32eb88523090b50ac850637947853cf5b6d"
dependencies = [
"arrayvec",
"bitvec",
@@ -3821,9 +3817,9 @@ dependencies = [
[[package]]
name = "rasn-cms"
version = "0.10.1"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5"
checksum = "6ecf9f1bb38cbb2a032014f0329d7fd9c2b08f26c4fc882ad642bb95dfefd74f"
dependencies = [
"rasn",
"rasn-pkix",
@@ -3831,9 +3827,9 @@ dependencies = [
[[package]]
name = "rasn-derive"
version = "0.10.1"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618"
checksum = "a1e6ddbc9ada563036d59c322cb0886a9b08b346904eebbcd20af2e01caecee7"
dependencies = [
"either",
"itertools 0.10.5",
@@ -3846,9 +3842,9 @@ dependencies = [
[[package]]
name = "rasn-pkix"
version = "0.10.1"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8"
checksum = "b894c903130c4915d79d8d9ce155429b3896b25efa5f81de4d9ab7b1b0f0b7cf"
dependencies = [
"rasn",
]
@@ -3904,14 +3900,14 @@ dependencies = [
[[package]]
name = "regex"
version = "1.9.6"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff"
checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.3.9",
"regex-syntax 0.7.5",
"regex-automata 0.4.1",
"regex-syntax 0.8.0",
]
[[package]]
@@ -3925,13 +3921,13 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.3.9"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax 0.7.5",
"regex-syntax 0.8.0",
]
[[package]]
@@ -3942,9 +3938,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.7.5"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d"
[[package]]
name = "reqwest"
@@ -4610,7 +4606,7 @@ checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
[[package]]
name = "sieve-rs"
version = "0.3.1"
source = "git+https://github.com/stalwartlabs/sieve#c9288b62815610872e9f278b904e34d46124acb5"
source = "git+https://github.com/stalwartlabs/sieve#bbb265765ebe92394e429001e90ba2e9b4201f9a"
dependencies = [
"ahash 0.8.3",
"bincode",
@@ -4690,13 +4686,13 @@ dependencies = [
"imagesize",
"infer",
"lazy_static",
"linkify",
"lru-cache",
"mail-auth",
"mail-builder",
"mail-parser",
"mail-send",
"md5",
"nlp",
"num_cpus",
"parking_lot",
"rand 0.8.5",


@@ -23,7 +23,7 @@
use mail_send::Credentials;
use crate::{Directory, Principal, QueryColumn};
use crate::{DatabaseColumn, Directory, Principal};
use super::CachedDirectory;
@@ -71,11 +71,15 @@ impl<T: Directory> Directory for CachedDirectory<T> {
self.inner.expn(address).await
}
async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
self.inner.lookup(query, params).await
}
async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
self.inner.query(query, params).await
}


@@ -24,7 +24,7 @@
use mail_send::Credentials;
use smtp_proto::{AUTH_CRAM_MD5, AUTH_LOGIN, AUTH_OAUTHBEARER, AUTH_PLAIN, AUTH_XOAUTH2};
use crate::{Directory, DirectoryError, Principal, QueryColumn};
use crate::{DatabaseColumn, Directory, DirectoryError, Principal};
use super::{ImapDirectory, ImapError};
@@ -98,11 +98,15 @@ impl Directory for ImapDirectory {
Err(DirectoryError::unsupported("imap", "expn"))
}
async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
Err(DirectoryError::unsupported("imap", "lookup"))
}
async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
_: &str,
_: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
Err(DirectoryError::unsupported("imap", "query"))
}


@@ -24,7 +24,7 @@
use ldap3::{ResultEntry, Scope, SearchEntry};
use mail_send::Credentials;
use crate::{Directory, Principal, QueryColumn, Type};
use crate::{DatabaseColumn, Directory, Principal, Type};
use super::{LdapDirectory, LdapMappings};
@@ -239,13 +239,17 @@ impl Directory for LdapDirectory {
Ok(emails)
}
async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
self.query_(query, params)
.await
.map(|entry| entry.is_some())
}
async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
self.query_(query, params).await.map(|entry| {
if let Some(entry) = entry {
let mut object = String::new();
@@ -257,7 +261,7 @@ impl Directory for LdapDirectory {
object.push('\n');
}
}
vec![QueryColumn::Text(object)]
vec![DatabaseColumn::Text(object.into())]
} else {
vec![]
}
@@ -283,7 +287,11 @@ impl Directory for LdapDirectory {
}
impl LdapDirectory {
async fn query_(&self, query: &str, params: &[&str]) -> crate::Result<Option<ResultEntry>> {
async fn query_(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> crate::Result<Option<ResultEntry>> {
let mut conn = self.pool.get().await?;
tracing::trace!(context = "directory", event = "query", query = query, params = ?params);
@@ -292,7 +300,7 @@ impl LdapDirectory {
for (pos, item) in query.split('?').enumerate() {
if pos > 0 {
if let Some(param) = params.get(pos - 1) {
expanded_query.push_str(param);
expanded_query.push_str(param.as_str());
}
}
expanded_query.push_str(item);


@@ -21,7 +21,11 @@
* for more details.
*/
use std::{borrow::Cow, fmt::Debug, sync::Arc};
use std::{
borrow::Cow,
fmt::{Debug, Display},
sync::Arc,
};
use ahash::{AHashMap, AHashSet};
use bb8::RunError;
@@ -82,8 +86,12 @@ pub trait Directory: Sync + Send {
async fn rcpt(&self, address: &str) -> crate::Result<bool>;
async fn vrfy(&self, address: &str) -> Result<Vec<String>>;
async fn expn(&self, address: &str) -> Result<Vec<String>>;
async fn lookup(&self, query: &str, params: &[&str]) -> Result<bool>;
async fn query(&self, query: &str, params: &[&str]) -> Result<Vec<QueryColumn>>;
async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> Result<bool>;
async fn query(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> Result<Vec<DatabaseColumn<'static>>>;
fn type_name(&self) -> &'static str {
std::any::type_name::<Self>()
@@ -91,12 +99,12 @@ pub trait Directory: Sync + Send {
}
#[derive(Clone, Debug)]
pub enum QueryColumn {
pub enum DatabaseColumn<'x> {
Integer(i64),
Bool(bool),
Float(f64),
Text(String),
Blob(Vec<u8>),
Text(Cow<'x, str>),
Blob(Cow<'x, [u8]>),
Null,
}
@@ -169,24 +177,24 @@ impl PartialEq for MatchType {
impl Eq for MatchType {}
impl Lookup {
pub async fn contains(&self, item: &str) -> Option<bool> {
pub async fn contains(&self, item: impl Into<DatabaseColumn<'_>>) -> Option<bool> {
match self {
Lookup::Directory { directory, query } => {
match directory.lookup(query, &[item]).await {
match directory.lookup(query, &[item.into()]).await {
Ok(result) => result.into(),
Err(_) => None,
}
}
Lookup::List { list } => list.contains(item).into(),
Lookup::Map { map } => map.contains_key(item).into(),
Lookup::List { list } => list.contains(item.into().as_str()).into(),
Lookup::Map { map } => map.contains_key(item.into().as_str()).into(),
}
}
pub async fn lookup(&self, item: &str) -> Option<Variable<'static>> {
pub async fn lookup(&self, items: &[DatabaseColumn<'_>]) -> Option<Variable<'static>> {
match self {
Lookup::Directory { directory, query } => match directory.query(query, &[item]).await {
Lookup::Directory { directory, query } => match directory.query(query, items).await {
Ok(mut result) => match result.len() {
1 if !matches!(result.first(), Some(QueryColumn::Null)) => {
1 if !matches!(result.first(), Some(DatabaseColumn::Null)) => {
result.pop().map(Variable::from).unwrap()
}
0 => Variable::default(),
@@ -195,21 +203,34 @@ impl Lookup {
.into(),
Err(_) => None,
},
Lookup::List { list } => Some(list.contains(item).into()),
Lookup::Map { map } => map.get(item).cloned(),
Lookup::List { list } => Some(list.contains(items[0].as_str()).into()),
Lookup::Map { map } => map.get(items[0].as_str()).cloned(),
}
}
pub async fn query(
&self,
items: &[DatabaseColumn<'_>],
) -> Option<Vec<DatabaseColumn<'static>>> {
match self {
Lookup::Directory { directory, query } => match directory.query(query, items).await {
Ok(result) => Some(result),
Err(_) => None,
},
_ => None,
}
}
}
impl From<QueryColumn> for Variable<'static> {
fn from(value: QueryColumn) -> Self {
impl<'x> From<DatabaseColumn<'x>> for Variable<'static> {
fn from(value: DatabaseColumn) -> Self {
match value {
QueryColumn::Integer(v) => Variable::Integer(v),
QueryColumn::Bool(v) => Variable::Integer(i64::from(v)),
QueryColumn::Float(v) => Variable::Float(v),
QueryColumn::Text(v) => Variable::String(v),
QueryColumn::Blob(v) => Variable::String(v.into_string()),
QueryColumn::Null => Variable::StringRef(""),
DatabaseColumn::Integer(v) => Variable::Integer(v),
DatabaseColumn::Bool(v) => Variable::Integer(i64::from(v)),
DatabaseColumn::Float(v) => Variable::Float(v),
DatabaseColumn::Text(v) => Variable::String(v.into_owned()),
DatabaseColumn::Blob(v) => Variable::String(v.into_owned().into_string()),
DatabaseColumn::Null => Variable::StringRef(""),
}
}
}
@@ -457,3 +478,115 @@ impl AddressMapping {
}
}
}
impl<'x> DatabaseColumn<'x> {
pub fn as_str(&self) -> &str {
match self {
Self::Text(v) => v.as_ref(),
_ => "",
}
}
}
impl<'x> From<&'x str> for DatabaseColumn<'x> {
fn from(value: &'x str) -> Self {
Self::Text(value.into())
}
}
impl<'x> From<String> for DatabaseColumn<'x> {
fn from(value: String) -> Self {
Self::Text(value.into())
}
}
impl<'x> From<&'x String> for DatabaseColumn<'x> {
fn from(value: &'x String) -> Self {
Self::Text(value.into())
}
}
impl<'x> From<Cow<'x, str>> for DatabaseColumn<'x> {
fn from(value: Cow<'x, str>) -> Self {
Self::Text(value)
}
}
impl<'x> From<bool> for DatabaseColumn<'x> {
fn from(value: bool) -> Self {
Self::Bool(value)
}
}
impl<'x> From<i64> for DatabaseColumn<'x> {
fn from(value: i64) -> Self {
Self::Integer(value)
}
}
impl<'x> From<u64> for DatabaseColumn<'x> {
fn from(value: u64) -> Self {
Self::Integer(value as i64)
}
}
impl<'x> From<u32> for DatabaseColumn<'x> {
fn from(value: u32) -> Self {
Self::Integer(value as i64)
}
}
impl<'x> From<f64> for DatabaseColumn<'x> {
fn from(value: f64) -> Self {
Self::Float(value)
}
}
impl<'x> From<&'x [u8]> for DatabaseColumn<'x> {
fn from(value: &'x [u8]) -> Self {
Self::Blob(value.into())
}
}
impl<'x> From<Vec<u8>> for DatabaseColumn<'x> {
fn from(value: Vec<u8>) -> Self {
Self::Blob(value.into())
}
}
impl<'x> From<Variable<'x>> for DatabaseColumn<'x> {
fn from(value: Variable<'x>) -> Self {
match value {
Variable::String(v) => Self::Text(v.into()),
Variable::StringRef(v) => Self::Text(v.into()),
Variable::Integer(v) => Self::Integer(v),
Variable::Float(v) => Self::Float(v),
v => Self::Text(v.into_string().into()),
}
}
}
impl<'x> From<&'x Variable<'x>> for DatabaseColumn<'x> {
fn from(value: &'x Variable<'x>) -> Self {
match value {
Variable::String(v) => Self::Text(v.into()),
Variable::StringRef(v) => Self::Text((*v).into()),
Variable::Integer(v) => Self::Integer(*v),
Variable::Float(v) => Self::Float(*v),
v => Self::Text(v.to_string().into()),
}
}
}
impl<'x> Display for DatabaseColumn<'x> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DatabaseColumn::Text(v) => f.write_str(v.as_ref()),
DatabaseColumn::Integer(v) => write!(f, "{}", v),
DatabaseColumn::Bool(v) => write!(f, "{}", v),
DatabaseColumn::Float(v) => write!(f, "{}", v),
DatabaseColumn::Blob(v) => write!(f, "{}", String::from_utf8_lossy(v.as_ref())),
DatabaseColumn::Null => write!(f, "NULL"),
}
}
}
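Editorial sketch (not part of this commit): how a caller builds parameters for the reworked trait. Here `directory` stands in for any `impl Directory`, and the query text is illustrative.

let params = [
    DatabaseColumn::from("jdoe"), // borrows the &str (Cow::Borrowed), no allocation
    DatabaseColumn::from(42i64),  // integer parameter
];
let exists = directory
    .lookup("SELECT 1 FROM accounts WHERE name = ? AND id = ?", &params)
    .await?;

Owned values (String, Vec<u8>) convert the same way as Cow::Owned, which is what lets query() return self-contained DatabaseColumn<'static> rows.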


@@ -23,7 +23,7 @@
use mail_send::Credentials;
use crate::{Directory, DirectoryError, Principal, QueryColumn};
use crate::{DatabaseColumn, Directory, DirectoryError, Principal};
use super::{EmailType, MemoryDirectory};
@@ -132,11 +132,15 @@ impl Directory for MemoryDirectory {
Ok(result)
}
async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
Err(DirectoryError::unsupported("memory", "lookp"))
}
async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
_: &str,
_: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
Err(DirectoryError::unsupported("memory", "query"))
}


@@ -24,7 +24,7 @@
use mail_send::{smtp::AssertReply, Credentials};
use smtp_proto::Severity;
use crate::{Directory, DirectoryError, Principal, QueryColumn};
use crate::{DatabaseColumn, Directory, DirectoryError, Principal};
use super::{SmtpClient, SmtpDirectory};
@@ -93,11 +93,15 @@ impl Directory for SmtpDirectory {
.await
}
async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
Err(DirectoryError::unsupported("smtp", "lookup"))
}
async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
_: &str,
_: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
Err(DirectoryError::unsupported("smtp", "query"))
}


@@ -25,7 +25,7 @@ use futures::TryStreamExt;
use mail_send::Credentials;
use sqlx::{any::AnyRow, postgres::any::AnyTypeInfoKind, Column, Row};
use crate::{Directory, Principal, QueryColumn, Type};
use crate::{DatabaseColumn, Directory, Principal, Type};
use super::{SqlDirectory, SqlMappings};
@@ -154,35 +154,39 @@ impl Directory for SqlDirectory {
.map_err(Into::into)
}
async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
self.query_(query, params).await.map(|row| row.is_some())
}
async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
async fn query(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> crate::Result<Vec<DatabaseColumn<'static>>> {
self.query_(query, params).await.map(|row| {
if let Some(row) = row {
let mut columns = Vec::with_capacity(row.columns().len());
for col in row.columns() {
let idx = col.ordinal();
columns.push(match col.type_info().kind() {
AnyTypeInfoKind::Null => QueryColumn::Null,
AnyTypeInfoKind::Null => DatabaseColumn::Null,
AnyTypeInfoKind::Bool => {
QueryColumn::Bool(row.try_get(idx).unwrap_or_default())
DatabaseColumn::Bool(row.try_get(idx).unwrap_or_default())
}
AnyTypeInfoKind::SmallInt
| AnyTypeInfoKind::Integer
| AnyTypeInfoKind::BigInt => {
QueryColumn::Integer(row.try_get(idx).unwrap_or_default())
DatabaseColumn::Integer(row.try_get(idx).unwrap_or_default())
}
AnyTypeInfoKind::Real | AnyTypeInfoKind::Double => {
QueryColumn::Float(row.try_get(idx).unwrap_or_default())
}
AnyTypeInfoKind::Text => {
QueryColumn::Text(row.try_get(idx).unwrap_or_default())
}
AnyTypeInfoKind::Blob => {
QueryColumn::Blob(row.try_get(idx).unwrap_or_default())
DatabaseColumn::Float(row.try_get(idx).unwrap_or_default())
}
AnyTypeInfoKind::Text => DatabaseColumn::Text(
row.try_get::<String, _>(idx).unwrap_or_default().into(),
),
AnyTypeInfoKind::Blob => DatabaseColumn::Blob(
row.try_get::<Vec<u8>, _>(idx).unwrap_or_default().into(),
),
});
}
columns
@@ -204,11 +208,24 @@ impl Directory for SqlDirectory {
}
impl SqlDirectory {
async fn query_(&self, query: &str, params: &[&str]) -> crate::Result<Option<AnyRow>> {
async fn query_(
&self,
query: &str,
params: &[DatabaseColumn<'_>],
) -> crate::Result<Option<AnyRow>> {
tracing::trace!(context = "directory", event = "query", query = query, params = ?params);
let mut q = sqlx::query(query);
for param in params {
q = q.bind(param);
q = match param {
DatabaseColumn::Text(v) => q.bind(v.as_ref()),
DatabaseColumn::Integer(v) => q.bind(v),
DatabaseColumn::Bool(v) => q.bind(v),
DatabaseColumn::Float(v) => q.bind(v),
DatabaseColumn::Blob(v) => {
q.bind(std::str::from_utf8(v.as_ref()).unwrap_or_default())
}
DatabaseColumn::Null => q.bind(""),
}
}
q.fetch(&self.pool).try_next().await.map_err(Into::into)


@@ -37,8 +37,8 @@ p256 = { version = "0.13", features = ["ecdh"] }
hkdf = "0.12.3"
sha2 = "0.10.1"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-webpki-roots"]}
tokio-tungstenite = "0.20.0"
tungstenite = "0.20.0"
tokio-tungstenite = "0.20"
tungstenite = "0.20"
chrono = "0.4"
dashmap = "5.4"
aes = "0.8.3"


@@ -5,6 +5,7 @@ edition = "2021"
resolver = "2"
[dependencies]
utils = { path = "../utils" }
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
siphasher = "1.0"
@@ -17,3 +18,12 @@ whatlang = "0.16" # Language detection
rust-stemmers = "1.2" # Stemmers
tinysegmenter = "0.1" # Japanese tokenizer
jieba-rs = "0.6" # Chinese stemmer
phf = { version = "0.11", features = ["macros"] }
lru-cache = "0.1.2"
parking_lot = "0.12.1"
[features]
test_mode = []
[dev-dependencies]
tokio = { version = "1.23", features = ["full"] }


@@ -1,77 +0,0 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use nohash::IsEnabled;
use crate::transformers::osb::{Gram, OsbToken};
use super::TokenHash;
pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
buf: Vec<u8>,
tokens: T,
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
pub fn new(tokens: T) -> Self {
Self {
buf: Vec::with_capacity(64),
tokens,
}
}
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
type Item = OsbToken<TokenHash>;
fn next(&mut self) -> Option<Self::Item> {
self.tokens.next().map(|token| {
let bytes = match token.inner {
Gram::Uni { t1 } => t1.as_bytes(),
Gram::Bi { t1, t2, .. } => {
self.buf.clear();
self.buf.extend_from_slice(t1.as_bytes());
self.buf.push(b' ');
self.buf.extend_from_slice(t2.as_bytes());
&self.buf
}
};
OsbToken {
inner: TokenHash {
h1: xxhash_rust::xxh3::xxh3_64(bytes),
h2: farmhash::hash64(bytes),
},
idx: token.idx,
}
})
}
}
impl std::hash::Hash for TokenHash {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
state.write_u64(self.h1 ^ self.h2);
}
}
impl IsEnabled for TokenHash {}


@@ -0,0 +1,107 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{
hash::BuildHasherDefault,
time::{Duration, Instant},
};
use lru_cache::LruCache;
use nohash::NoHashHasher;
use parking_lot::Mutex;
use super::{TokenHash, Weights};
#[derive(Debug)]
pub struct BayesTokenCache {
positive: Mutex<LruCache<TokenHash, CacheItem, BuildHasherDefault<NoHashHasher<TokenHash>>>>,
negative: Mutex<LruCache<TokenHash, Instant, BuildHasherDefault<NoHashHasher<TokenHash>>>>,
ttl_negative: Duration,
ttl_positive: Duration,
}
#[derive(Debug, Clone)]
pub struct CacheItem {
item: Weights,
valid_until: Instant,
}
impl BayesTokenCache {
pub fn new(capacity: usize, ttl_positive: Duration, ttl_negative: Duration) -> Self {
Self {
positive: Mutex::new(LruCache::with_hasher(capacity, Default::default())),
negative: Mutex::new(LruCache::with_hasher(capacity, Default::default())),
ttl_negative,
ttl_positive,
}
}
pub fn get(&self, hash: &TokenHash) -> Option<Option<Weights>> {
{
let mut pos_cache = self.positive.lock();
if let Some(entry) = pos_cache.get_mut(hash) {
return if entry.valid_until >= Instant::now() {
Some(Some(entry.item))
} else {
pos_cache.remove(hash);
None
};
}
}
{
let mut neg_cache = self.negative.lock();
if let Some(entry) = neg_cache.get_mut(hash) {
return if *entry >= Instant::now() {
Some(None)
} else {
neg_cache.remove(hash);
None
};
}
}
None
}
pub fn insert_positive(&self, hash: TokenHash, weights: Weights) {
self.positive.lock().insert(
hash,
CacheItem {
item: weights,
valid_until: Instant::now() + self.ttl_positive,
},
);
}
pub fn insert_negative(&self, hash: TokenHash) {
self.negative
.lock()
.insert(hash, Instant::now() + self.ttl_negative);
}
pub fn invalidate(&self, hash: &TokenHash) {
if self.positive.lock().remove(hash).is_none() {
self.negative.lock().remove(hash);
}
}
}
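Editorial sketch (not part of this commit): the Option<Option<Weights>> returned by get encodes three distinct states. The `hash` binding below is illustrative.

let cache = BayesTokenCache::new(8192, Duration::from_secs(3600), Duration::from_secs(3600));
match cache.get(&hash) {
    Some(Some(weights)) => { /* fresh positive entry: token weights are known */ }
    Some(None) => { /* fresh negative entry: token known to be absent from the store */ }
    None => {
        // Not cached (or entry expired): fetch from the backing store, then call
        // cache.insert_positive(hash, weights) or cache.insert_negative(hash).
    }
}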


@@ -21,13 +21,14 @@
* for more details.
*/
use crate::transformers::osb::OsbToken;
use crate::tokenizers::osb::OsbToken;
use super::{BayesClassifier, Weights};
// Position 0 represents Unigram weights
const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
// Credits: ported from RSpamd
impl BayesClassifier {
pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
where

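Editorial note (not part of this commit): FEATURE_WEIGHT appears to be indexed by OsbToken::idx, mirroring RSpamd's OSB feature weights — the unigram at idx 0 contributes with weight 1.0, and the skip-bigrams at window distances 1..=4 are weighted 3125.0, 256.0, 27.0 and 1.0.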

@@ -26,8 +26,11 @@ use std::{collections::HashMap, hash::BuildHasherDefault};
use nohash::NoHashHasher;
use serde::{Deserialize, Serialize};
pub mod bloom;
use crate::tokenizers::osb::Gram;
pub mod cache;
pub mod classify;
pub mod tokenize;
pub mod train;
#[derive(Debug, Serialize, Deserialize, Default)]
@@ -37,7 +40,7 @@ pub struct BayesModel {
pub ham_learns: u32,
}
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BayesClassifier {
pub min_token_hits: u32,
pub min_tokens: u32,
@@ -47,14 +50,14 @@ pub struct BayesClassifier {
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
pub struct TokenHash {
h1: u64,
h2: u64,
pub h1: u64,
pub h2: u64,
}
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
pub struct Weights {
spam: u32,
ham: u32,
pub spam: u32,
pub ham: u32,
}
impl BayesClassifier {
@@ -73,3 +76,32 @@ impl Default for BayesClassifier {
Self::new()
}
}
impl From<Gram<'_>> for TokenHash {
fn from(value: Gram<'_>) -> Self {
match value {
Gram::Uni { t1 } => TokenHash {
h1: xxhash_rust::xxh3::xxh3_64(t1.as_bytes()),
h2: farmhash::hash64(t1.as_bytes()),
},
Gram::Bi { t1, t2, .. } => {
let mut buf = Vec::with_capacity(t1.len() + t2.len() + 1);
buf.extend_from_slice(t1.as_bytes());
buf.push(b' ');
buf.extend_from_slice(t2.as_bytes());
TokenHash {
h1: xxhash_rust::xxh3::xxh3_64(&buf),
h2: farmhash::hash64(&buf),
}
}
}
}
}
impl std::hash::Hash for TokenHash {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
state.write_u64(self.h1 ^ self.h2);
}
}
impl nohash::IsEnabled for TokenHash {}
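Editorial sketch (not part of this commit): pre-mixing the two 64-bit digests is what allows the weights map to use nohash's pass-through hasher, as BayesModel does. Values below are illustrative.

use std::{collections::HashMap, hash::BuildHasherDefault};
use nohash::NoHashHasher;

let mut weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>> =
    HashMap::default();
weights.insert(Gram::Uni { t1: "offer" }.into(), Weights { spam: 3, ham: 1 });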

File diff suppressed because it is too large


@@ -21,7 +21,7 @@
* for more details.
*/
use crate::transformers::osb::OsbToken;
use crate::tokenizers::osb::OsbToken;
use super::{BayesModel, TokenHash};


@@ -21,6 +21,10 @@
* for more details.
*/
pub mod detect;
pub mod stemmer;
pub mod stopwords;
use std::borrow::Cow;
use crate::tokenizers::{
@@ -29,9 +33,6 @@ use crate::tokenizers::{
use self::detect::LanguageDetector;
pub mod detect;
pub mod stemmer;
pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
impl Language {
@@ -131,57 +132,9 @@ pub enum Language {
impl Language {
pub fn from_iso_639(code: &str) -> Option<Self> {
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
_ => return None,
}
.into()
LANG_ISO
.get(code.split_once('-').map(|c| c.0).unwrap_or(code))
.copied()
}
}
@@ -200,3 +153,53 @@ impl Language {
}
}
}
static LANG_ISO: phf::Map<&'static str, Language> = phf::phf_map! {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
};
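Editorial sketch (not part of this commit): the region subtag is stripped before the phf lookup, so plain and regioned codes resolve to the same language:

assert!(matches!(Language::from_iso_639("en"), Some(Language::English)));
assert!(matches!(Language::from_iso_639("en-US"), Some(Language::English)));
assert!(Language::from_iso_639("xx").is_none());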


@@ -70,7 +70,7 @@ impl<'x> Iterator for Stemmer<'x> {
}
}
static STEMMER_MAP: &[Option<Algorithm>] = &[
pub static STEMMER_MAP: &[Option<Algorithm>] = &[
None, // Esperanto = 0,
Some(Algorithm::English), // English = 1,
Some(Algorithm::Russian), // Russian = 2,

File diff suppressed because it is too large


@@ -1,59 +1,52 @@
use ahash::AHashSet;
pub mod bayes;
pub mod language;
pub mod tokenizers;
pub mod transformers;
#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
pub suffixes: AHashSet<String>,
pub exceptions: AHashSet<String>,
pub wildcards: Vec<String>,
}
impl PublicSuffix {
pub fn contains(&self, suffix: &str) -> bool {
self.suffixes.contains(suffix)
|| (!self.exceptions.contains(suffix)
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
}
}
#[cfg(test)]
mod test {
use std::fs;
use utils::suffixlist::PublicSuffix;
use crate::{
bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
transformers::osb::{OsbToken, OsbTokenizer},
bayes::{tokenize::BayesTokenizer, BayesClassifier, BayesModel},
tokenizers::osb::{OsbToken, OsbTokenizer},
};
#[test]
#[ignore]
fn train() {
let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
let db =
fs::read_to_string("/Users/me/code/mail-server/_ignore/spam_or_not_spam.csv").unwrap();
let mut bayes = BayesModel::default();
let suffixes = PublicSuffix::default();
for line in db.lines() {
let (text, is_spam) = line.rsplit_once(',').unwrap();
let is_spam = is_spam == "1";
bayes.train(
BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5),
is_spam,
);
}
println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
fs::write(
"/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin",
bincode::serialize(&bayes).unwrap(),
)
.unwrap();
}
#[test]
#[ignore]
fn classify() {
let model: BayesModel =
bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
let model: BayesModel = bincode::deserialize(
&fs::read("/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin").unwrap(),
)
.unwrap();
let bayes = BayesClassifier::new();
let suffixes = PublicSuffix::default();
for text in [
"i am attaching to this email a presentation to integrate the spreadsheet into our server",
@@ -65,7 +58,7 @@ mod test {
"{:?} -> {}",
text,
bayes
.classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
.classify(OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5).filter_map(|x| model.weights.get(&x.inner).map(|w| {
OsbToken {
idx: x.idx,
inner: *w,


@@ -29,7 +29,7 @@ use super::{InnerToken, Token};
use lazy_static::lazy_static;
lazy_static! {
static ref JIEBA: Jieba = Jieba::new();
pub static ref JIEBA: Jieba = Jieba::new();
}
pub struct ChineseTokenizer<'x, T, I>


@@ -23,6 +23,7 @@
pub mod chinese;
pub mod japanese;
pub mod osb;
pub mod space;
pub mod types;
pub mod word;


@@ -0,0 +1,358 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, iter::Peekable};
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OsbToken<T> {
pub inner: T,
pub idx: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Gram<'x> {
Uni { t1: &'x str },
Bi { t1: &'x str, t2: &'x str },
}
pub struct OsbTokenizer<'x, I, R>
where
I: Iterator<Item = Cow<'x, str>>,
R: for<'y> From<Gram<'y>> + 'static,
{
iter: Peekable<I>,
buf: Vec<Option<Cow<'x, str>>>,
window_size: usize,
window_pos: usize,
window_idx: usize,
phantom: std::marker::PhantomData<R>,
}
impl<'x, I, R> OsbTokenizer<'x, I, R>
where
I: Iterator<Item = Cow<'x, str>>,
R: for<'y> From<Gram<'y>> + 'static,
{
pub fn new(iter: I, window_size: usize) -> Self {
Self {
iter: iter.peekable(),
buf: vec![None; window_size],
window_pos: 0,
window_idx: 0,
window_size,
phantom: std::marker::PhantomData,
}
}
}
impl<'x, I, R> Iterator for OsbTokenizer<'x, I, R>
where
I: Iterator<Item = Cow<'x, str>>,
R: for<'y> From<Gram<'y>> + 'static,
{
type Item = OsbToken<R>;
fn next(&mut self) -> Option<Self::Item> {
let end_pos = (self.window_pos + self.window_idx) % self.window_size;
if self.buf[end_pos].is_none() {
self.buf[end_pos] = self.iter.next();
}
let t1 = self.buf[self.window_pos % self.window_size].as_deref()?;
let token = OsbToken {
inner: R::from(if self.window_idx != 0 {
Gram::Bi {
t1,
t2: self.buf[end_pos].as_deref()?,
}
} else {
Gram::Uni { t1 }
}),
idx: self.window_idx,
};
// Increment window index
self.window_idx += 1;
if self.window_idx == self.window_size
|| (self.iter.peek().is_none()
&& self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
{
self.buf[self.window_pos % self.window_size] = None;
self.window_idx = 0;
self.window_pos += 1;
}
Some(token)
}
}
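// Editorial note (not part of this commit): with window_size = 5, each input
// token yields one unigram (idx 0) plus up to four skip-bigrams (idx 1..=4)
// pairing it with each of the next four tokens, as the test below spells out.
// Making the item type generic over R: From<Gram> lets callers hash grams on
// the fly (e.g. into TokenHash) without borrowing the tokenizer's buffer.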
#[cfg(test)]
mod test {
use std::borrow::Cow;
use crate::tokenizers::osb::{Gram, OsbToken};
impl From<Gram<'_>> for String {
fn from(value: Gram<'_>) -> Self {
match value {
Gram::Uni { t1 } => t1.to_string(),
Gram::Bi { t1, t2 } => format!("{t1} {t2}"),
}
}
}
#[test]
fn osb_tokenizer() {
assert_eq!(
super::OsbTokenizer::new(
"The quick brown fox jumps over the lazy dog and the lazy cat"
.split_ascii_whitespace()
.map(Cow::from),
5,
)
.collect::<Vec<_>>(),
vec![
OsbToken {
inner: "The".to_string(),
idx: 0
},
OsbToken {
inner: "The quick".to_string(),
idx: 1
},
OsbToken {
inner: "The brown".to_string(),
idx: 2
},
OsbToken {
inner: "The fox".to_string(),
idx: 3
},
OsbToken {
inner: "The jumps".to_string(),
idx: 4
},
OsbToken {
inner: "quick".to_string(),
idx: 0
},
OsbToken {
inner: "quick brown".to_string(),
idx: 1
},
OsbToken {
inner: "quick fox".to_string(),
idx: 2
},
OsbToken {
inner: "quick jumps".to_string(),
idx: 3
},
OsbToken {
inner: "quick over".to_string(),
idx: 4
},
OsbToken {
inner: "brown".to_string(),
idx: 0
},
OsbToken {
inner: "brown fox".to_string(),
idx: 1
},
OsbToken {
inner: "brown jumps".to_string(),
idx: 2
},
OsbToken {
inner: "brown over".to_string(),
idx: 3
},
OsbToken {
inner: "brown the".to_string(),
idx: 4
},
OsbToken {
inner: "fox".to_string(),
idx: 0
},
OsbToken {
inner: "fox jumps".to_string(),
idx: 1
},
OsbToken {
inner: "fox over".to_string(),
idx: 2
},
OsbToken {
inner: "fox the".to_string(),
idx: 3
},
OsbToken {
inner: "fox lazy".to_string(),
idx: 4
},
OsbToken {
inner: "jumps".to_string(),
idx: 0
},
OsbToken {
inner: "jumps over".to_string(),
idx: 1
},
OsbToken {
inner: "jumps the".to_string(),
idx: 2
},
OsbToken {
inner: "jumps lazy".to_string(),
idx: 3
},
OsbToken {
inner: "jumps dog".to_string(),
idx: 4
},
OsbToken {
inner: "over".to_string(),
idx: 0
},
OsbToken {
inner: "over the".to_string(),
idx: 1
},
OsbToken {
inner: "over lazy".to_string(),
idx: 2
},
OsbToken {
inner: "over dog".to_string(),
idx: 3
},
OsbToken {
inner: "over and".to_string(),
idx: 4
},
OsbToken {
inner: "the".to_string(),
idx: 0
},
OsbToken {
inner: "the lazy".to_string(),
idx: 1
},
OsbToken {
inner: "the dog".to_string(),
idx: 2
},
OsbToken {
inner: "the and".to_string(),
idx: 3
},
OsbToken {
inner: "the the".to_string(),
idx: 4
},
OsbToken {
inner: "lazy".to_string(),
idx: 0
},
OsbToken {
inner: "lazy dog".to_string(),
idx: 1
},
OsbToken {
inner: "lazy and".to_string(),
idx: 2
},
OsbToken {
inner: "lazy the".to_string(),
idx: 3
},
OsbToken {
inner: "lazy lazy".to_string(),
idx: 4
},
OsbToken {
inner: "dog".to_string(),
idx: 0
},
OsbToken {
inner: "dog and".to_string(),
idx: 1
},
OsbToken {
inner: "dog the".to_string(),
idx: 2
},
OsbToken {
inner: "dog lazy".to_string(),
idx: 3
},
OsbToken {
inner: "dog cat".to_string(),
idx: 4
},
OsbToken {
inner: "and".to_string(),
idx: 0
},
OsbToken {
inner: "and the".to_string(),
idx: 1
},
OsbToken {
inner: "and lazy".to_string(),
idx: 2
},
OsbToken {
inner: "and cat".to_string(),
idx: 3
},
OsbToken {
inner: "the".to_string(),
idx: 0
},
OsbToken {
inner: "the lazy".to_string(),
idx: 1
},
OsbToken {
inner: "the cat".to_string(),
idx: 2
},
OsbToken {
inner: "lazy".to_string(),
idx: 0
},
OsbToken {
inner: "lazy cat".to_string(),
idx: 1
},
OsbToken {
inner: "cat".to_string(),
idx: 0
}
]
);
}
}


@@ -23,7 +23,7 @@
use std::str::CharIndices;
use crate::PublicSuffix;
use utils::suffixlist::PublicSuffix;
use super::Token;
@@ -31,35 +31,39 @@ pub struct TypesTokenizer<'x, 'y> {
text: &'x str,
suffixes: &'y PublicSuffix,
iter: CharIndices<'x>,
tokens: Vec<Token<TokenType<'x>>>,
tokens: Vec<Token<TokenType<&'x str>>>,
peek_pos: usize,
last_ch_is_space: bool,
last_token_is_dot: bool,
eof: bool,
tokenize_urls: bool,
tokenize_urls_without_scheme: bool,
tokenize_emails: bool,
tokenize_numbers: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType<'x> {
Alphabetic(&'x str),
Integer(&'x str),
Alphanumeric(&'x str),
Hexadecimal(&'x str),
pub enum TokenType<T> {
Alphabetic(T),
Integer(T),
Alphanumeric(T),
Hexadecimal(T),
Other(char),
Punctuation(char),
Space,
// Detected types
Url(&'x str),
UrlNoScheme(&'x str),
UrlNoHost(&'x str),
Email(&'x str),
Float(&'x str),
Url(T),
UrlNoScheme(T),
UrlNoHost(T),
Email(T),
Float(T),
}
impl Copy for Token<TokenType<'_>> {}
impl Copy for Token<TokenType<&'_ str>> {}
impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
type Item = Token<TokenType<'x>>;
type Item = Token<TokenType<&'x str>>;
fn next(&mut self) -> Option<Self::Item> {
let token = self.peek()?;
@@ -67,7 +71,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
self.last_token_is_dot = matches!(token.word, TokenType::Punctuation('.'));
// Try parsing URL with scheme
if matches!(
if self.tokenize_urls
&& matches!(
token.word,
TokenType::Alphabetic(t) | TokenType::Hexadecimal(t)
if t.len() <= 8 && t.chars().all(|c| c.is_ascii()))
@@ -82,7 +87,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
}
// Try parsing email
if token.word.is_email_atom()
if self.tokenize_emails
&& token.word.is_email_atom()
&& self.peek_has_tokens(
&[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
TokenType::Space,
@@ -97,7 +103,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
}
// Try parsing URL without scheme
if token.word.is_domain_atom(true)
if self.tokenize_urls_without_scheme
&& token.word.is_domain_atom(true)
&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
{
if let Some(url) = self.try_parse_url(None) {
@@ -109,7 +116,7 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
}
// Try parsing currencies and floating point numbers
if !last_is_dot {
if self.tokenize_numbers && !last_is_dot {
if let Some(num) = self.try_parse_number() {
self.peek_advance();
return Some(num);
@@ -132,9 +139,33 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
suffixes,
last_ch_is_space: false,
last_token_is_dot: false,
tokenize_urls: true,
tokenize_urls_without_scheme: true,
tokenize_emails: true,
tokenize_numbers: true,
}
}
pub fn tokenize_urls(mut self, tokenize: bool) -> Self {
self.tokenize_urls = tokenize;
self
}
pub fn tokenize_urls_without_scheme(mut self, tokenize: bool) -> Self {
self.tokenize_urls_without_scheme = tokenize;
self
}
pub fn tokenize_emails(mut self, tokenize: bool) -> Self {
self.tokenize_emails = tokenize;
self
}
pub fn tokenize_numbers(mut self, tokenize: bool) -> Self {
self.tokenize_numbers = tokenize;
self
}
fn consume(&mut self) -> bool {
let mut has_alpha = false;
let mut has_number = false;
@@ -212,7 +243,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
}
}
fn next_(&mut self) -> Option<Token<TokenType<'x>>> {
fn next_(&mut self) -> Option<Token<TokenType<&'x str>>> {
if self.tokens.is_empty() && !self.eof {
self.consume();
}
@@ -223,7 +254,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
}
}
fn peek(&mut self) -> Option<Token<TokenType<'x>>> {
fn peek(&mut self) -> Option<Token<TokenType<&'x str>>> {
while self.tokens.len() <= self.peek_pos && !self.eof {
self.consume();
}
@@ -244,7 +275,11 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
self.peek_pos = 0;
}
fn peek_has_tokens(&mut self, tokens: &[TokenType<'_>], stop_token: TokenType<'_>) -> bool {
fn peek_has_tokens(
&mut self,
tokens: &[TokenType<&'_ str>],
stop_token: TokenType<&'_ str>,
) -> bool {
let mut tokens = tokens.iter().copied();
let mut token = tokens.next().unwrap();
while let Some(t) = self.peek() {
@@ -266,8 +301,8 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
fn try_parse_url(
&mut self,
scheme_token: Option<Token<TokenType<'_>>>,
) -> Option<Token<TokenType<'x>>> {
scheme_token: Option<Token<TokenType<&'_ str>>>,
) -> Option<Token<TokenType<&'x str>>> {
let (has_scheme, allow_blank_host) = scheme_token.as_ref().map_or((false, false), |t| {
(
true,
@@ -480,7 +515,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
.into()
}
fn try_parse_email(&mut self) -> Option<Token<TokenType<'x>>> {
fn try_parse_email(&mut self) -> Option<Token<TokenType<&'x str>>> {
// Start token is a valid local part atom
let start_token = self.peek()?;
let mut last_is_dot = false;
@@ -615,7 +650,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
None
}
fn try_parse_number(&mut self) -> Option<Token<TokenType<'x>>> {
fn try_parse_number(&mut self) -> Option<Token<TokenType<&'x str>>> {
self.peek_rewind();
let mut start_pos = usize::MAX;
let mut end_pos = usize::MAX;
@@ -698,7 +733,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
}
}
impl<'x> TokenType<'x> {
impl<T> TokenType<T> {
fn is_email_atom(&self) -> bool {
matches!(
self,
@@ -744,7 +779,8 @@ impl<'x> TokenType<'x> {
#[cfg(test)]
mod test {
use crate::PublicSuffix;
use utils::suffixlist::PublicSuffix;
use super::{TokenType, TypesTokenizer};


@@ -1,24 +0,0 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
pub mod osb;


@@ -1,467 +0,0 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::iter::Peekable;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OsbToken<T> {
pub inner: T,
pub idx: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Gram<'x> {
Uni { t1: &'x str },
Bi { t1: &'x str, t2: &'x str },
}
pub struct OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
iter: Peekable<I>,
buf: Vec<Option<&'x str>>,
window_size: usize,
window_pos: usize,
window_idx: usize,
}
impl<'x, I> OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
pub fn new(iter: I, window_size: usize) -> Self {
Self {
iter: iter.peekable(),
buf: vec![None; window_size],
window_pos: 0,
window_idx: 0,
window_size,
}
}
}
impl<'x, I> Iterator for OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
type Item = OsbToken<Gram<'x>>;
fn next(&mut self) -> Option<Self::Item> {
let end_pos = (self.window_pos + self.window_idx) % self.window_size;
if self.buf[end_pos].is_none() {
self.buf[end_pos] = self.iter.next();
}
let t1 = self.buf[self.window_pos % self.window_size]?;
let token = OsbToken {
inner: if self.window_idx != 0 {
Gram::Bi {
t1,
t2: self.buf[end_pos]?,
}
} else {
Gram::Uni { t1 }
},
idx: self.window_idx,
};
// Increment window
self.window_idx += 1;
if self.window_idx == self.window_size
|| (self.iter.peek().is_none()
&& self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
{
self.buf[self.window_pos % self.window_size] = None;
self.window_idx = 0;
self.window_pos += 1;
}
Some(token)
}
}
#[cfg(test)]
mod test {
use crate::transformers::osb::{Gram, OsbToken};
#[test]
fn osb_tokenizer() {
assert_eq!(
super::OsbTokenizer::new(
"The quick brown fox jumps over the lazy dog and the lazy cat"
.split_ascii_whitespace(),
5
)
.collect::<Vec<_>>(),
vec![
OsbToken {
inner: Gram::Uni { t1: "The" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "quick"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "brown"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "fox"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "jumps"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "quick" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "brown"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "fox"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "jumps"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "over"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "brown" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "fox"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "jumps"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "over"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "fox" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "jumps"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "over"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "jumps" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "over"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "dog"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "over" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "dog"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "and"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "dog"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "and"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "dog"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "and"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "dog" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "and"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "cat"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "and" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "cat"
},
idx: 3
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "cat"
},
idx: 2
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "cat"
},
idx: 1
},
OsbToken {
inner: Gram::Uni { t1: "cat" },
idx: 0
}
]
);
}
}


@@ -13,6 +13,7 @@ resolver = "2"
[dependencies]
utils = { path = "../utils" }
nlp = { path = "../nlp" }
directory = { path = "../directory" }
mail-auth = { git = "https://github.com/stalwartlabs/mail-auth" }
mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] }
@@ -50,7 +51,6 @@ num_cpus = "1.15.0"
lazy_static = "1.4"
whatlang = "0.16"
imagesize = "0.12"
linkify = "0.10"
idna = "0.4"
decancer = "1.6.1"
unicode-security = "0.1.0"


@@ -39,7 +39,7 @@ use std::{
time::Duration,
};
use ahash::{AHashMap, AHashSet};
use ahash::AHashMap;
use directory::{Directory, DirectoryConfig, Lookup};
use mail_auth::{
common::crypto::{Ed25519Key, RsaKey, Sha256},
@@ -541,13 +541,6 @@ pub enum VerifyStrategy {
Disable,
}
#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
pub suffixes: AHashSet<String>,
pub exceptions: AHashSet<String>,
pub wildcards: Vec<String>,
}
#[derive(Default)]
pub struct ConfigContext<'x> {
pub servers: &'x [Server],


@@ -34,9 +34,7 @@ use mail_auth::{
};
use crate::{core::Resolvers, outbound::dane::DnssecResolver};
use utils::config::Config;
use super::PublicSuffix;
use utils::{config::Config, suffixlist::PublicSuffix};
pub trait ConfigResolver {
fn build_resolvers(&self) -> super::Result<Resolvers>;
@@ -108,9 +106,9 @@ impl ConfigResolver for Config {
}
fn parse_public_suffix(&self) -> super::Result<PublicSuffix> {
let mut ps = PublicSuffix::default();
let mut has_values = false;
for (_, value) in self.values("resolver.public-suffix") {
has_values = true;
let bytes = if value.starts_with("https://") || value.starts_with("http://") {
match tokio::task::block_in_place(|| {
reqwest::blocking::get(value).and_then(|r| {
@@ -175,20 +173,7 @@ match String::from_utf8(bytes) {
match String::from_utf8(bytes) {
Ok(list) => {
for line in list.lines() {
let line = line.trim().to_lowercase();
if !line.starts_with("//") {
if let Some(domain) = line.strip_prefix('*') {
ps.wildcards.push(domain.to_string());
} else if let Some(domain) = line.strip_prefix('!') {
ps.exceptions.insert(domain.to_string());
} else {
ps.suffixes.insert(line.to_string());
}
}
}
return Ok(ps);
return Ok(PublicSuffix::from(list.as_str()));
}
Err(err) => {
tracing::warn!(
@@ -200,16 +185,10 @@ impl ConfigResolver for Config {
}
}
tracing::warn!("Failed to parse public suffixes from any source.");
if has_values {
tracing::warn!("Failed to parse public suffixes from any source.");
}
Ok(ps)
}
}
impl PublicSuffix {
pub fn contains(&self, suffix: &str) -> bool {
self.suffixes.contains(suffix)
|| (!self.exceptions.contains(suffix)
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
Ok(PublicSuffix::default())
}
}


@@ -21,25 +21,33 @@
* for more details.
*/
use std::time::Duration;
use std::{sync::Arc, time::Duration};
use directory::Lookup;
use nlp::bayes::{cache::BayesTokenCache, BayesClassifier};
use sieve::{compiler::grammar::Capability, Compiler, Runtime};
use crate::{
core::{SieveConfig, SieveCore},
scripts::{functions::register_functions, plugins::RegisterSievePlugins},
};
use utils::config::{utils::AsKey, Config};
use utils::{
config::{utils::AsKey, Config},
suffixlist::PublicSuffix,
};
use super::{resolver::ConfigResolver, ConfigContext, PublicSuffix};
use super::{resolver::ConfigResolver, ConfigContext};
pub trait ConfigSieve {
fn parse_sieve(&self, ctx: &mut ConfigContext) -> super::Result<SieveCore>;
}
#[derive(Clone, Default)]
pub struct SieveContext {
pub psl: PublicSuffix,
pub bayes_classify: BayesClassifier,
pub bayes_cache: BayesTokenCache,
pub lookup_classify: Arc<Lookup>,
pub lookup_train: Arc<Lookup>,
}
impl ConfigSieve for Config {
@@ -48,6 +56,29 @@ impl ConfigSieve for Config {
let mut fnc_map = register_functions().register_plugins();
let sieve_ctx = SieveContext {
psl: self.parse_public_suffix()?,
bayes_classify: BayesClassifier {
min_token_hits: self.property_or_static("bayes.min-token-hits", "2")?,
min_tokens: self.property_or_static("bayes.min-tokens", "11")?,
min_prob_strength: self.property_or_static("bayes.min-prob-strength", "0.05")?,
min_learns: self.property_or_static("bayes.min-learns", "200")?,
},
bayes_cache: BayesTokenCache::new(
self.property_or_static("bayes.cache.capacity", "8192")?,
self.property_or_static("bayes.cache.ttl.positive", "1h")?,
self.property_or_static("bayes.cache.ttl.negative", "1h")?,
),
lookup_classify: ctx
.directory
.lookups
.get("bayes.tokens.classify")
.ok_or("No lookup found for key bayes.tokens.classify.".to_string())?
.clone(),
lookup_train: ctx
.directory
.lookups
.get("bayes.tokens.train")
.ok_or("No lookup found for key bayes.tokens.train.".to_string())?
.clone(),
};
// Allocate compiler and runtime
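Written out, the defaults wired in above describe the classifier's shape (a sketch only; the four field names come straight from this hunk, and the numeric types are assumed to match the "2"/"11"/"0.05"/"200" default strings):

use nlp::bayes::BayesClassifier;

// Sketch of the default classifier produced when no bayes.* keys are set.
let classify = BayesClassifier {
    min_token_hits: 2,       // bayes.min-token-hits
    min_tokens: 11,          // bayes.min-tokens
    min_prob_strength: 0.05, // bayes.min-prob-strength
    min_learns: 200,         // bayes.min-learns
};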

View file

@ -24,7 +24,6 @@
use core::panic;
use std::{sync::Arc, time::Duration};
use ahash::AHashMap;
use directory::Lookup;
use mail_auth::common::headers::HeaderWriter;
use sieve::{
@ -68,8 +67,6 @@ impl SMTP {
let mut modifications = vec![];
let mut keep_id = usize::MAX;
let mut plugin_data = AHashMap::new();
// Start event loop
while let Some(result) = instance.run(input) {
match result {
@ -125,7 +122,6 @@ impl SMTP {
span: &span,
handle: &handle,
core: self,
data: &mut plugin_data,
message: instance.message(),
arguments,
},

View file

@ -21,11 +21,12 @@
* for more details.
*/
use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use sieve::{runtime::Variable, Context};
use crate::{config::scripts::SieveContext, scripts::functions::url::tokenize_email};
use crate::config::scripts::SieveContext;
use super::{html::html_to_tokens, url::tokenize_url, ApplyString};
use super::{html::html_to_tokens, ApplyString};
pub fn fn_trim<'x>(_: &'x Context<'x, SieveContext>, v: Vec<Variable<'x>>) -> Variable<'x> {
v[0].transform(|s| Variable::StringRef(s.trim()))
@ -106,13 +107,49 @@ pub fn fn_tokenize<'x>(
ctx: &'x Context<'x, SieveContext>,
mut v: Vec<Variable<'x>>,
) -> Variable<'x> {
match v[1].to_cow().as_ref() {
"html" => html_to_tokens(v[0].to_cow().as_ref()).into(),
"words" => tokenize_words(&v[0]),
"uri" | "url" => tokenize_url(ctx, v.remove(0), false),
"uri_strict" | "url_strict" => tokenize_url(ctx, v.remove(0), true),
"email" => tokenize_email(v.remove(0)),
_ => Variable::default(),
let (urls, urls_without_scheme, emails) = match v[1].to_cow().as_ref() {
"html" => return html_to_tokens(v[0].to_cow().as_ref()).into(),
"words" => return tokenize_words(&v[0]),
"uri" | "url" => (true, true, true),
"uri_strict" | "url_strict" => (true, false, false),
"email" => (false, false, true),
_ => return Variable::default(),
};
match v.remove(0) {
Variable::StringRef(text) => TypesTokenizer::new(text, &ctx.context().psl)
.tokenize_numbers(false)
.tokenize_urls(urls)
.tokenize_urls_without_scheme(urls_without_scheme)
.tokenize_emails(emails)
.filter_map(|t| match t.word {
TokenType::Url(text) if urls => Variable::StringRef(text).into(),
TokenType::UrlNoScheme(text) if urls_without_scheme => {
Variable::String(format!("https://{text}")).into()
}
TokenType::Email(text) if emails => Variable::StringRef(text).into(),
_ => None,
})
.collect::<Vec<_>>()
.into(),
v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
TypesTokenizer::new(v.to_cow().as_ref(), &ctx.context().psl)
.tokenize_numbers(false)
.tokenize_urls(urls)
.tokenize_urls_without_scheme(urls_without_scheme)
.tokenize_emails(emails)
.filter_map(|t| match t.word {
TokenType::Url(text) if urls => Variable::String(text.to_string()).into(),
TokenType::UrlNoScheme(text) if urls_without_scheme => {
Variable::String(format!("https://{text}")).into()
}
TokenType::Email(text) if emails => Variable::String(text.to_string()).into(),
_ => None,
})
.collect::<Vec<_>>()
.into()
}
v => v,
}
}
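A minimal sketch of the pipeline those mode flags drive, assuming a PublicSuffix value psl is in scope; "url" mode enables all three switches, so scheme-less domains are normalized with an https:// prefix:

use nlp::tokenizers::types::{TokenType, TypesTokenizer};

// Sketch: extract URLs, scheme-less URLs and e-mail addresses from text.
let tokens: Vec<String> = TypesTokenizer::new("see example.com or mail a@b.org", &psl)
    .tokenize_numbers(false)
    .tokenize_urls(true)
    .tokenize_urls_without_scheme(true)
    .tokenize_emails(true)
    .filter_map(|t| match t.word {
        TokenType::Url(s) => Some(s.to_string()),
        TokenType::UrlNoScheme(s) => Some(format!("https://{s}")), // add scheme
        TokenType::Email(s) => Some(s.to_string()),
        _ => None,
    })
    .collect();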

View file

@ -21,94 +21,13 @@
* for more details.
*/
use std::net::IpAddr;
use hyper::Uri;
use linkify::LinkKind;
use sieve::{runtime::Variable, Context};
use crate::config::scripts::SieveContext;
use super::ApplyString;
pub fn tokenize_url<'x>(
ctx: &'x Context<'x, SieveContext>,
v: Variable<'x>,
must_have_scheme: bool,
) -> Variable<'x> {
match v {
Variable::StringRef(text) => linkify::LinkFinder::new()
.url_must_have_scheme(must_have_scheme)
.kinds(&[LinkKind::Url])
.links(text.as_ref())
.filter_map(|url| filter_url(url.as_str(), must_have_scheme, ctx))
.collect::<Vec<_>>()
.into(),
v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
linkify::LinkFinder::new()
.url_must_have_scheme(must_have_scheme)
.kinds(&[LinkKind::Url])
.links(v.to_cow().as_ref())
.filter_map(|url| {
filter_url(url.as_str(), must_have_scheme, ctx).map(|v| v.into_owned())
})
.collect::<Vec<_>>()
.into()
}
v => v,
}
}
pub fn tokenize_email(v: Variable<'_>) -> Variable<'_> {
match v {
Variable::StringRef(text) => linkify::LinkFinder::new()
.email_domain_must_have_dot(true)
.kinds(&[LinkKind::Email])
.links(text.as_ref())
.map(|email| Variable::StringRef(email.as_str()))
.collect::<Vec<_>>()
.into(),
v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
linkify::LinkFinder::new()
.email_domain_must_have_dot(true)
.kinds(&[LinkKind::Email])
.links(v.to_cow().as_ref())
.map(|email| Variable::String(email.as_str().to_string()))
.collect::<Vec<_>>()
.into()
}
v => v,
}
}
fn filter_url<'x, 'y>(
url: &'x str,
must_have_scheme: bool,
ctx: &'y Context<'y, SieveContext>,
) -> Option<Variable<'x>> {
if must_have_scheme || url.contains("://") {
Some(Variable::StringRef(url))
} else {
// Filter out possible URLs without a valid TLD
let host = url.split_once('/').map_or(url, |(f, _)| f);
if (host
.as_bytes()
.first()
.map_or(true, |ch| ch.is_ascii_hexdigit())
&& host.parse::<IpAddr>().is_ok())
|| ctx
.context()
.psl
.contains(host.rsplit_once('.').map_or(host, |(_, tld)| tld))
|| host.ends_with(".onion")
{
Some(Variable::String(format!("https://{url}")))
} else {
None
}
}
}
pub fn fn_uri_part<'x>(_: &'x Context<'x, SieveContext>, v: Vec<Variable<'x>>) -> Variable<'x> {
let part = v[1].to_cow();
v[0].transform(|uri| {

View file

@ -0,0 +1,206 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use directory::{DatabaseColumn, Lookup};
use nlp::{
bayes::{cache::BayesTokenCache, tokenize::BayesTokenizer, BayesModel, TokenHash, Weights},
tokenizers::osb::{OsbToken, OsbTokenizer},
};
use sieve::{runtime::Variable, FunctionMap};
use tokio::runtime::Handle;
use crate::config::scripts::SieveContext;
use super::PluginContext;
pub fn register_train(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
fnc_map.set_external_function("bayes_train", plugin_id, 2);
}
pub fn register_untrain(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
fnc_map.set_external_function("bayes_untrain", plugin_id, 2);
}
pub fn register_classify(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
fnc_map.set_external_function("bayes_classify", plugin_id, 1);
}
pub fn exec_train(ctx: PluginContext<'_>) -> Variable<'static> {
train(ctx, true)
}
pub fn exec_untrain(ctx: PluginContext<'_>) -> Variable<'static> {
train(ctx, false)
}
fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable<'static> {
let mut arguments = ctx.arguments.into_iter();
let text = arguments.next().unwrap().into_string();
if text.is_empty() {
return false.into();
}
let handle = ctx.handle;
let ctx = ctx.core.sieve.runtime.context();
// Train the model
let is_spam = arguments.next().unwrap().to_bool();
let mut model = BayesModel::default();
model.train(
OsbTokenizer::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5),
is_spam,
);
if model.weights.is_empty() {
return false.into();
}
// Update weights and invalidate cache entries
let upsert = &ctx.lookup_train;
for (hash, weights) in model.weights {
let (s_weight, h_weight) = if is_train {
(weights.spam as i64, weights.ham as i64)
} else {
(-(weights.spam as i64), -(weights.ham as i64))
};
if handle
.block_on(upsert.lookup(&[
hash.h1.into(),
hash.h2.into(),
s_weight.into(),
h_weight.into(),
]))
.is_none()
{
return false.into();
}
ctx.bayes_cache.invalidate(&hash);
}
// Update training counts
let train_val = if is_train { 1i64 } else { -1i64 };
let (spam_count, ham_count) = if is_spam {
(train_val, 0i64)
} else {
(0i64, train_val)
};
if handle
.block_on(upsert.query(&[
0i64.into(),
0i64.into(),
spam_count.into(),
ham_count.into(),
]))
.is_none()
{
return false.into();
}
ctx.bayes_cache.invalidate(&TokenHash::default());
true.into()
}
pub fn exec_classify(ctx: PluginContext<'_>) -> Variable<'static> {
let mut arguments = ctx.arguments.into_iter();
let text = arguments.next().unwrap().into_string();
if text.is_empty() {
return 0.into();
}
let handle = ctx.handle;
let ctx = ctx.core.sieve.runtime.context();
let get_token = &ctx.lookup_classify;
// Obtain training counts
let (spam_learns, ham_learns) = if let Some(weights) =
ctx.bayes_cache
.get_or_update(TokenHash::default(), handle, get_token)
{
(weights.spam, weights.ham)
} else {
return 0.into();
};
// Make sure we have enough training data
if spam_learns < ctx.bayes_classify.min_learns || ham_learns < ctx.bayes_classify.min_learns {
return 0.into();
}
// Classify the text
ctx.bayes_classify
.classify(
OsbTokenizer::<_, TokenHash>::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5)
.filter_map(|t| {
OsbToken {
inner: ctx.bayes_cache.get_or_update(t.inner, handle, get_token)?,
idx: t.idx,
}
.into()
}),
ham_learns,
spam_learns,
)
.unwrap_or_default()
.into()
}
trait LookupOrInsert {
fn get_or_update(
&self,
hash: TokenHash,
handle: &Handle,
get_token: &Lookup,
) -> Option<Weights>;
}
impl LookupOrInsert for BayesTokenCache {
fn get_or_update(
&self,
hash: TokenHash,
handle: &Handle,
get_token: &Lookup,
) -> Option<Weights> {
if let Some(weights) = self.get(&hash) {
weights.unwrap_or_default().into()
} else if let Some(result) =
handle.block_on(get_token.query(&[hash.h1.into(), hash.h2.into()]))
{
let mut result = result.into_iter();
match (result.next(), result.next()) {
(Some(DatabaseColumn::Integer(spam)), Some(DatabaseColumn::Integer(ham))) => {
let weights = Weights {
spam: spam as u32,
ham: ham as u32,
};
self.insert_positive(hash, weights);
weights
}
_ => {
self.insert_negative(hash);
Weights::default()
}
}
.into()
} else {
// Lookup failed; return None without caching
None
}
}
}
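Note the counter-row convention used throughout this plugin: the row addressed by TokenHash::default() (both hashes zero) holds the global spam/ham learn counts, which is why training bumps it last and classification reads it first. Stripped of the lookup store and cache, the training flow reduces to roughly this in-memory sketch (psl assumed in scope):

use nlp::{
    bayes::{tokenize::BayesTokenizer, BayesModel},
    tokenizers::osb::OsbTokenizer,
};

// Sketch: train an in-memory model on one spam sample using the same
// OSB token windows (size 5) that the plugin writes to the lookup store.
let mut model = BayesModel::default();
model.train(
    OsbTokenizer::new(BayesTokenizer::new("cheap meds, act now", &psl), 5),
    true, // is_spam
);
assert!(!model.weights.is_empty()); // one Weights entry per TokenHash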

View file

@ -21,6 +21,7 @@
* for more details.
*/
use directory::DatabaseColumn;
use sieve::{runtime::Variable, FunctionMap};
use crate::config::scripts::SieveContext;
@ -62,15 +63,20 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
}
pub fn exec_map(ctx: PluginContext<'_>) -> Variable<'static> {
let lookup_id = ctx.arguments[0].to_cow();
let item = ctx.arguments[1].to_cow();
let mut arguments = ctx.arguments.into_iter();
let lookup_id = arguments.next().unwrap().into_cow();
let items = match arguments.next().unwrap() {
Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(),
Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(),
v => vec![DatabaseColumn::from(v)],
};
let span = ctx.span;
if !lookup_id.is_empty() && !item.is_empty() {
if !lookup_id.is_empty() && !items.is_empty() {
if let Some(lookup) = ctx.core.sieve.lookup.get(lookup_id.as_ref()) {
return ctx
.handle
.block_on(lookup.lookup(item.as_ref()))
.block_on(lookup.lookup(&items))
.unwrap_or_default();
} else {
tracing::warn!(

View file

@ -21,13 +21,13 @@
* for more details.
*/
pub mod bayes;
pub mod dns;
pub mod exec;
pub mod http;
pub mod lookup;
pub mod query;
use ahash::AHashMap;
use mail_parser::Message;
use sieve::{runtime::Variable, FunctionMap, Input};
use tokio::runtime::Handle;
@ -41,12 +41,11 @@ pub struct PluginContext<'x> {
pub span: &'x tracing::Span,
pub handle: &'x Handle,
pub core: &'x SMTP,
pub data: &'x mut AHashMap<String, String>,
pub message: &'x Message<'x>,
pub arguments: Vec<Variable<'static>>,
}
const PLUGINS_EXEC: [ExecPluginFnc; 7] = [
const PLUGINS_EXEC: [ExecPluginFnc; 10] = [
query::exec,
exec::exec,
lookup::exec,
@ -54,8 +53,11 @@ const PLUGINS_EXEC: [ExecPluginFnc; 7] = [
dns::exec,
dns::exec_exists,
http::exec_header,
bayes::exec_train,
bayes::exec_untrain,
bayes::exec_classify,
];
const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [
const PLUGINS_REGISTER: [RegisterPluginFnc; 10] = [
query::register,
exec::register,
lookup::register,
@ -63,6 +65,9 @@ const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [
dns::register,
dns::register_exists,
http::register_header,
bayes::register_train,
bayes::register_untrain,
bayes::register_classify,
];
pub trait RegisterSievePlugins {

View file

@ -22,7 +22,7 @@
*/
use crate::config::scripts::SieveContext;
use directory::QueryColumn;
use directory::DatabaseColumn;
use sieve::{runtime::Variable, FunctionMap};
use super::PluginContext;
@ -62,8 +62,12 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
return false.into();
}
// Obtain parameters
let parameters = arguments.next().unwrap().into_string_array();
// Obtain arguments
let arguments = match arguments.next().unwrap() {
Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(),
Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(),
v => vec![DatabaseColumn::from(v)],
};
// Run query
if query
@ -71,12 +75,9 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
.get(..6)
.map_or(false, |q| q.eq_ignore_ascii_case(b"SELECT"))
{
if let Ok(mut query_columns) = ctx.handle.block_on(directory.query(
&query,
&parameters.iter().map(String::as_str).collect::<Vec<_>>(),
)) {
if let Ok(mut query_columns) = ctx.handle.block_on(directory.query(&query, &arguments)) {
match query_columns.len() {
1 if !matches!(query_columns.first(), Some(QueryColumn::Null)) => {
1 if !matches!(query_columns.first(), Some(DatabaseColumn::Null)) => {
query_columns.pop().map(Variable::from).unwrap()
}
0 => Variable::default(),
@ -87,10 +88,7 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
}
} else {
ctx.handle
.block_on(directory.lookup(
&query,
&parameters.iter().map(String::as_str).collect::<Vec<_>>(),
))
.block_on(directory.lookup(&query, &arguments))
.is_ok()
.into()
}

View file

@ -298,6 +298,18 @@ impl ParseValue for u64 {
}
}
impl ParseValue for f64 {
fn parse_value(key: impl AsKey, value: &str) -> super::Result<Self> {
value.parse().map_err(|_| {
format!(
"Invalid floating point value {:?} for property {:?}.",
value,
key.as_key()
)
})
}
}
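This unblocks fractional settings such as bayes.min-prob-strength. A sketch of the round trip, assuming &str implements AsKey like the neighboring impls and super::Result carries a String error:

// Sketch: "0.05" parses; malformed input yields an error naming the property.
assert_eq!(f64::parse_value("bayes.min-prob-strength", "0.05"), Ok(0.05));
assert!(f64::parse_value("bayes.min-prob-strength", "5%").is_err());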
impl ParseValue for u16 {
fn parse_value(key: impl AsKey, value: &str) -> super::Result<Self> {
value.parse().map_err(|_| {

View file

@ -30,6 +30,7 @@ pub mod config;
pub mod ipc;
pub mod listener;
pub mod map;
pub mod suffixlist;
use opentelemetry::{
sdk::{

View file

@ -0,0 +1,59 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use ahash::AHashSet;
#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
pub suffixes: AHashSet<String>,
pub exceptions: AHashSet<String>,
pub wildcards: Vec<String>,
}
impl PublicSuffix {
pub fn contains(&self, suffix: &str) -> bool {
self.suffixes.contains(suffix)
|| (!self.exceptions.contains(suffix)
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
}
}
impl From<&str> for PublicSuffix {
fn from(list: &str) -> Self {
let mut ps = PublicSuffix::default();
for line in list.lines() {
let line = line.trim().to_lowercase();
if !line.starts_with("//") {
if let Some(domain) = line.strip_prefix('*') {
ps.wildcards.push(domain.to_string());
} else if let Some(domain) = line.strip_prefix('!') {
ps.exceptions.insert(domain.to_string());
} else {
ps.suffixes.insert(line.to_string());
}
}
}
ps.suffixes.insert("onion".to_string());
ps
}
}
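The lookup semantics in one example (hypothetical list entries; the real data is the public suffix list downloaded in resolver.rs):

// Sketch: exact suffixes, "*" wildcards and "!" exceptions, plus the
// hardcoded "onion" entry appended by From<&str>.
let ps = PublicSuffix::from("// comments are skipped\ncom\n*.uk\n!city.uk");
assert!(ps.contains("com"));      // exact match
assert!(ps.contains("co.uk"));    // ends with the ".uk" wildcard remainder
assert!(!ps.contains("city.uk")); // exception wins over the wildcard
assert!(ps.contains("onion"));    // always present for Tor addresses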

View file

@ -1,6 +1,5 @@
# Mailing list scores
let "ml_score" "count(header.List-Id:List-Archive:List-Owner:List-Help:List-Post:X-Loop:List-Subscribe:List-Unsubscribe[*].exists) * 0.125";
eval "print('ml_score: ' + ml_score)";
if eval "ml_score < 1" {
if eval "header.List-Id.exists" {
let "ml_score" "ml_score + 0.50";