Mirror of https://github.com/stalwartlabs/mail-server.git (synced 2025-12-09 12:55:57 +08:00)

Commit ace58f74eb — Bayes classifier (parent 3d9efd363a)
41 changed files with 6737 additions and 934 deletions
Cargo.lock — generated, 64 lines changed

@@ -93,9 +93,9 @@ dependencies = [
 [[package]]
 name = "aho-corasick"
-version = "1.1.1"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
 dependencies = [
  "memchr",
 ]

@@ -2676,15 +2676,6 @@ version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
-
-[[package]]
-name = "linkify"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780"
-dependencies = [
- "memchr",
-]

 [[package]]
 name = "linux-raw-sys"
 version = "0.4.10"

@@ -2994,11 +2985,16 @@ dependencies = [
  "farmhash",
  "jieba-rs",
  "lazy_static",
+ "lru-cache",
  "nohash",
+ "parking_lot",
+ "phf",
  "rust-stemmers",
  "serde",
+ "siphasher 1.0.0",
  "tinysegmenter",
+ "tokio",
  "utils",
  "whatlang",
  "xxhash-rust",
 ]

@@ -3294,9 +3290,9 @@ dependencies = [
 [[package]]
 name = "ordered-float"
-version = "3.9.1"
+version = "3.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
 dependencies = [
  "num-traits",
 ]

@@ -3630,9 +3626,9 @@ dependencies = [
 [[package]]
 name = "proc-macro2"
-version = "1.0.68"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
 dependencies = [
  "unicode-ident",
 ]

@@ -3799,9 +3795,9 @@ dependencies = [
 [[package]]
 name = "rasn"
-version = "0.10.1"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2"
+checksum = "c22b7f7ff0508dae62e1be69fe02f32eb88523090b50ac850637947853cf5b6d"
 dependencies = [
  "arrayvec",
  "bitvec",

@@ -3821,9 +3817,9 @@ dependencies = [
 [[package]]
 name = "rasn-cms"
-version = "0.10.1"
+version = "0.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5"
+checksum = "6ecf9f1bb38cbb2a032014f0329d7fd9c2b08f26c4fc882ad642bb95dfefd74f"
 dependencies = [
  "rasn",
  "rasn-pkix",

@@ -3831,9 +3827,9 @@ dependencies = [
 [[package]]
 name = "rasn-derive"
-version = "0.10.1"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618"
+checksum = "a1e6ddbc9ada563036d59c322cb0886a9b08b346904eebbcd20af2e01caecee7"
 dependencies = [
  "either",
  "itertools 0.10.5",

@@ -3846,9 +3842,9 @@ dependencies = [
 [[package]]
 name = "rasn-pkix"
-version = "0.10.1"
+version = "0.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8"
+checksum = "b894c903130c4915d79d8d9ce155429b3896b25efa5f81de4d9ab7b1b0f0b7cf"
 dependencies = [
  "rasn",
 ]

@@ -3904,14 +3900,14 @@ dependencies = [
 [[package]]
 name = "regex"
-version = "1.9.6"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff"
+checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata 0.3.9",
- "regex-syntax 0.7.5",
+ "regex-automata 0.4.1",
+ "regex-syntax 0.8.0",
 ]

 [[package]]

@@ -3925,13 +3921,13 @@ dependencies = [
 [[package]]
 name = "regex-automata"
-version = "0.3.9"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
+checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax 0.7.5",
+ "regex-syntax 0.8.0",
 ]

 [[package]]

@@ -3942,9 +3938,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 [[package]]
 name = "regex-syntax"
-version = "0.7.5"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
+checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d"

 [[package]]
 name = "reqwest"

@@ -4610,7 +4606,7 @@ checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
 [[package]]
 name = "sieve-rs"
 version = "0.3.1"
-source = "git+https://github.com/stalwartlabs/sieve#c9288b62815610872e9f278b904e34d46124acb5"
+source = "git+https://github.com/stalwartlabs/sieve#bbb265765ebe92394e429001e90ba2e9b4201f9a"
 dependencies = [
  "ahash 0.8.3",
  "bincode",

@@ -4690,13 +4686,13 @@ dependencies = [
  "imagesize",
  "infer",
  "lazy_static",
- "linkify",
  "lru-cache",
  "mail-auth",
  "mail-builder",
  "mail-parser",
  "mail-send",
  "md5",
+ "nlp",
  "num_cpus",
  "parking_lot",
  "rand 0.8.5",
crates/directory/src/cache/lookup.rs — 10 lines changed

@@ -23,7 +23,7 @@
 use mail_send::Credentials;

-use crate::{Directory, Principal, QueryColumn};
+use crate::{DatabaseColumn, Directory, Principal};

 use super::CachedDirectory;

@@ -71,11 +71,15 @@ impl<T: Directory> Directory for CachedDirectory<T> {
         self.inner.expn(address).await
     }

-    async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         self.inner.lookup(query, params).await
     }

-    async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         self.inner.query(query, params).await
     }
crates/directory/src/imap/lookup.rs

@@ -24,7 +24,7 @@
 use mail_send::Credentials;
 use smtp_proto::{AUTH_CRAM_MD5, AUTH_LOGIN, AUTH_OAUTHBEARER, AUTH_PLAIN, AUTH_XOAUTH2};

-use crate::{Directory, DirectoryError, Principal, QueryColumn};
+use crate::{DatabaseColumn, Directory, DirectoryError, Principal};

 use super::{ImapDirectory, ImapError};

@@ -98,11 +98,15 @@ impl Directory for ImapDirectory {
         Err(DirectoryError::unsupported("imap", "expn"))
     }

-    async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         Err(DirectoryError::unsupported("imap", "lookup"))
     }

-    async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        _: &str,
+        _: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         Err(DirectoryError::unsupported("imap", "query"))
     }
crates/directory/src/ldap/lookup.rs

@@ -24,7 +24,7 @@
 use ldap3::{ResultEntry, Scope, SearchEntry};
 use mail_send::Credentials;

-use crate::{Directory, Principal, QueryColumn, Type};
+use crate::{DatabaseColumn, Directory, Principal, Type};

 use super::{LdapDirectory, LdapMappings};

@@ -239,13 +239,17 @@ impl Directory for LdapDirectory {
         Ok(emails)
     }

-    async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         self.query_(query, params)
             .await
             .map(|entry| entry.is_some())
     }

-    async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         self.query_(query, params).await.map(|entry| {
             if let Some(entry) = entry {
                 let mut object = String::new();

@@ -257,7 +261,7 @@ impl Directory for LdapDirectory {
                         object.push('\n');
                     }
                 }
-                vec![QueryColumn::Text(object)]
+                vec![DatabaseColumn::Text(object.into())]
             } else {
                 vec![]
             }

@@ -283,7 +287,11 @@ impl Directory for LdapDirectory {
 }

 impl LdapDirectory {
-    async fn query_(&self, query: &str, params: &[&str]) -> crate::Result<Option<ResultEntry>> {
+    async fn query_(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Option<ResultEntry>> {
         let mut conn = self.pool.get().await?;
         tracing::trace!(context = "directory", event = "query", query = query, params = ?params);

@@ -292,7 +300,7 @@ impl LdapDirectory {
         for (pos, item) in query.split('?').enumerate() {
             if pos > 0 {
                 if let Some(param) = params.get(pos - 1) {
-                    expanded_query.push_str(param);
+                    expanded_query.push_str(param.as_str());
                 }
             }
             expanded_query.push_str(item);
crates/directory/src/lib.rs

@@ -21,7 +21,11 @@
  * for more details.
  */

-use std::{borrow::Cow, fmt::Debug, sync::Arc};
+use std::{
+    borrow::Cow,
+    fmt::{Debug, Display},
+    sync::Arc,
+};

 use ahash::{AHashMap, AHashSet};
 use bb8::RunError;

@@ -82,8 +86,12 @@ pub trait Directory: Sync + Send {
     async fn rcpt(&self, address: &str) -> crate::Result<bool>;
     async fn vrfy(&self, address: &str) -> Result<Vec<String>>;
     async fn expn(&self, address: &str) -> Result<Vec<String>>;
-    async fn lookup(&self, query: &str, params: &[&str]) -> Result<bool>;
-    async fn query(&self, query: &str, params: &[&str]) -> Result<Vec<QueryColumn>>;
+    async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> Result<bool>;
+    async fn query(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> Result<Vec<DatabaseColumn<'static>>>;

     fn type_name(&self) -> &'static str {
         std::any::type_name::<Self>()

@@ -91,12 +99,12 @@ pub trait Directory: Sync + Send {
 }

 #[derive(Clone, Debug)]
-pub enum QueryColumn {
+pub enum DatabaseColumn<'x> {
     Integer(i64),
     Bool(bool),
     Float(f64),
-    Text(String),
-    Blob(Vec<u8>),
+    Text(Cow<'x, str>),
+    Blob(Cow<'x, [u8]>),
     Null,
 }

@@ -169,24 +177,24 @@ impl PartialEq for MatchType {
 impl Eq for MatchType {}

 impl Lookup {
-    pub async fn contains(&self, item: &str) -> Option<bool> {
+    pub async fn contains(&self, item: impl Into<DatabaseColumn<'_>>) -> Option<bool> {
         match self {
             Lookup::Directory { directory, query } => {
-                match directory.lookup(query, &[item]).await {
+                match directory.lookup(query, &[item.into()]).await {
                     Ok(result) => result.into(),
                     Err(_) => None,
                 }
             }
-            Lookup::List { list } => list.contains(item).into(),
-            Lookup::Map { map } => map.contains_key(item).into(),
+            Lookup::List { list } => list.contains(item.into().as_str()).into(),
+            Lookup::Map { map } => map.contains_key(item.into().as_str()).into(),
         }
     }

-    pub async fn lookup(&self, item: &str) -> Option<Variable<'static>> {
+    pub async fn lookup(&self, items: &[DatabaseColumn<'_>]) -> Option<Variable<'static>> {
         match self {
-            Lookup::Directory { directory, query } => match directory.query(query, &[item]).await {
+            Lookup::Directory { directory, query } => match directory.query(query, items).await {
                 Ok(mut result) => match result.len() {
-                    1 if !matches!(result.first(), Some(QueryColumn::Null)) => {
+                    1 if !matches!(result.first(), Some(DatabaseColumn::Null)) => {
                         result.pop().map(Variable::from).unwrap()
                     }
                     0 => Variable::default(),

@@ -195,21 +203,34 @@ impl Lookup {
                 .into(),
                 Err(_) => None,
             },
-            Lookup::List { list } => Some(list.contains(item).into()),
-            Lookup::Map { map } => map.get(item).cloned(),
+            Lookup::List { list } => Some(list.contains(items[0].as_str()).into()),
+            Lookup::Map { map } => map.get(items[0].as_str()).cloned(),
         }
     }

+    pub async fn query(
+        &self,
+        items: &[DatabaseColumn<'_>],
+    ) -> Option<Vec<DatabaseColumn<'static>>> {
+        match self {
+            Lookup::Directory { directory, query } => match directory.query(query, items).await {
+                Ok(result) => Some(result),
+                Err(_) => None,
+            },
+            _ => None,
+        }
+    }
 }

-impl From<QueryColumn> for Variable<'static> {
-    fn from(value: QueryColumn) -> Self {
+impl<'x> From<DatabaseColumn<'x>> for Variable<'static> {
+    fn from(value: DatabaseColumn) -> Self {
         match value {
-            QueryColumn::Integer(v) => Variable::Integer(v),
-            QueryColumn::Bool(v) => Variable::Integer(i64::from(v)),
-            QueryColumn::Float(v) => Variable::Float(v),
-            QueryColumn::Text(v) => Variable::String(v),
-            QueryColumn::Blob(v) => Variable::String(v.into_string()),
-            QueryColumn::Null => Variable::StringRef(""),
+            DatabaseColumn::Integer(v) => Variable::Integer(v),
+            DatabaseColumn::Bool(v) => Variable::Integer(i64::from(v)),
+            DatabaseColumn::Float(v) => Variable::Float(v),
+            DatabaseColumn::Text(v) => Variable::String(v.into_owned()),
+            DatabaseColumn::Blob(v) => Variable::String(v.into_owned().into_string()),
+            DatabaseColumn::Null => Variable::StringRef(""),
         }
     }
 }

@@ -457,3 +478,115 @@ impl AddressMapping {
         }
     }
 }
+
+impl<'x> DatabaseColumn<'x> {
+    pub fn as_str(&self) -> &str {
+        match self {
+            Self::Text(v) => v.as_ref(),
+            _ => "",
+        }
+    }
+}
+
+impl<'x> From<&'x str> for DatabaseColumn<'x> {
+    fn from(value: &'x str) -> Self {
+        Self::Text(value.into())
+    }
+}
+
+impl<'x> From<String> for DatabaseColumn<'x> {
+    fn from(value: String) -> Self {
+        Self::Text(value.into())
+    }
+}
+
+impl<'x> From<&'x String> for DatabaseColumn<'x> {
+    fn from(value: &'x String) -> Self {
+        Self::Text(value.into())
+    }
+}
+
+impl<'x> From<Cow<'x, str>> for DatabaseColumn<'x> {
+    fn from(value: Cow<'x, str>) -> Self {
+        Self::Text(value)
+    }
+}
+
+impl<'x> From<bool> for DatabaseColumn<'x> {
+    fn from(value: bool) -> Self {
+        Self::Bool(value)
+    }
+}
+
+impl<'x> From<i64> for DatabaseColumn<'x> {
+    fn from(value: i64) -> Self {
+        Self::Integer(value)
+    }
+}
+
+impl<'x> From<u64> for DatabaseColumn<'x> {
+    fn from(value: u64) -> Self {
+        Self::Integer(value as i64)
+    }
+}
+
+impl<'x> From<u32> for DatabaseColumn<'x> {
+    fn from(value: u32) -> Self {
+        Self::Integer(value as i64)
+    }
+}
+
+impl<'x> From<f64> for DatabaseColumn<'x> {
+    fn from(value: f64) -> Self {
+        Self::Float(value)
+    }
+}
+
+impl<'x> From<&'x [u8]> for DatabaseColumn<'x> {
+    fn from(value: &'x [u8]) -> Self {
+        Self::Blob(value.into())
+    }
+}
+
+impl<'x> From<Vec<u8>> for DatabaseColumn<'x> {
+    fn from(value: Vec<u8>) -> Self {
+        Self::Blob(value.into())
+    }
+}
+
+impl<'x> From<Variable<'x>> for DatabaseColumn<'x> {
+    fn from(value: Variable<'x>) -> Self {
+        match value {
+            Variable::String(v) => Self::Text(v.into()),
+            Variable::StringRef(v) => Self::Text(v.into()),
+            Variable::Integer(v) => Self::Integer(v),
+            Variable::Float(v) => Self::Float(v),
+            v => Self::Text(v.into_string().into()),
+        }
+    }
+}
+
+impl<'x> From<&'x Variable<'x>> for DatabaseColumn<'x> {
+    fn from(value: &'x Variable<'x>) -> Self {
+        match value {
+            Variable::String(v) => Self::Text(v.into()),
+            Variable::StringRef(v) => Self::Text((*v).into()),
+            Variable::Integer(v) => Self::Integer(*v),
+            Variable::Float(v) => Self::Float(*v),
+            v => Self::Text(v.to_string().into()),
+        }
+    }
+}
+
+impl<'x> Display for DatabaseColumn<'x> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DatabaseColumn::Text(v) => f.write_str(v.as_ref()),
+            DatabaseColumn::Integer(v) => write!(f, "{}", v),
+            DatabaseColumn::Bool(v) => write!(f, "{}", v),
+            DatabaseColumn::Float(v) => write!(f, "{}", v),
+            DatabaseColumn::Blob(v) => write!(f, "{}", String::from_utf8_lossy(v.as_ref())),
+            DatabaseColumn::Null => write!(f, "NULL"),
+        }
+    }
+}
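The block of From implementations above is what makes call sites ergonomic: any supported native value converts into a DatabaseColumn without naming the enum. A minimal sketch under that assumption (the identifiers and the query are illustrative, not from the commit):

// Sketch: building a typed parameter list through the From impls above.
fn example_params() -> Vec<DatabaseColumn<'static>> {
    vec![
        "jdoe".into(), // &str -> DatabaseColumn::Text(Cow::Borrowed)
        42i64.into(),  // i64  -> DatabaseColumn::Integer
        true.into(),   // bool -> DatabaseColumn::Bool
    ]
}
// A slice of these is then passed straight to Directory::lookup / query:
// directory.lookup("... WHERE login = ? AND level >= ? AND active = ?", &params).await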
crates/directory/src/memory/lookup.rs

@@ -23,7 +23,7 @@
 use mail_send::Credentials;

-use crate::{Directory, DirectoryError, Principal, QueryColumn};
+use crate::{DatabaseColumn, Directory, DirectoryError, Principal};

 use super::{EmailType, MemoryDirectory};

@@ -132,11 +132,15 @@ impl Directory for MemoryDirectory {
         Ok(result)
     }

-    async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         Err(DirectoryError::unsupported("memory", "lookup"))
     }

-    async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        _: &str,
+        _: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         Err(DirectoryError::unsupported("memory", "query"))
     }
crates/directory/src/smtp/lookup.rs

@@ -24,7 +24,7 @@
 use mail_send::{smtp::AssertReply, Credentials};
 use smtp_proto::Severity;

-use crate::{Directory, DirectoryError, Principal, QueryColumn};
+use crate::{DatabaseColumn, Directory, DirectoryError, Principal};

 use super::{SmtpClient, SmtpDirectory};

@@ -93,11 +93,15 @@ impl Directory for SmtpDirectory {
         .await
     }

-    async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         Err(DirectoryError::unsupported("smtp", "lookup"))
     }

-    async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        _: &str,
+        _: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         Err(DirectoryError::unsupported("smtp", "query"))
     }
crates/directory/src/sql/lookup.rs

@@ -25,7 +25,7 @@ use futures::TryStreamExt;
 use mail_send::Credentials;
 use sqlx::{any::AnyRow, postgres::any::AnyTypeInfoKind, Column, Row};

-use crate::{Directory, Principal, QueryColumn, Type};
+use crate::{DatabaseColumn, Directory, Principal, Type};

 use super::{SqlDirectory, SqlMappings};

@@ -154,35 +154,39 @@ impl Directory for SqlDirectory {
         .map_err(Into::into)
     }

-    async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result<bool> {
+    async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result<bool> {
         self.query_(query, params).await.map(|row| row.is_some())
     }

-    async fn query(&self, query: &str, params: &[&str]) -> crate::Result<Vec<QueryColumn>> {
+    async fn query(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Vec<DatabaseColumn<'static>>> {
         self.query_(query, params).await.map(|row| {
             if let Some(row) = row {
                 let mut columns = Vec::with_capacity(row.columns().len());
                 for col in row.columns() {
                     let idx = col.ordinal();
                     columns.push(match col.type_info().kind() {
-                        AnyTypeInfoKind::Null => QueryColumn::Null,
+                        AnyTypeInfoKind::Null => DatabaseColumn::Null,
                         AnyTypeInfoKind::Bool => {
-                            QueryColumn::Bool(row.try_get(idx).unwrap_or_default())
+                            DatabaseColumn::Bool(row.try_get(idx).unwrap_or_default())
                         }
                         AnyTypeInfoKind::SmallInt
                         | AnyTypeInfoKind::Integer
                         | AnyTypeInfoKind::BigInt => {
-                            QueryColumn::Integer(row.try_get(idx).unwrap_or_default())
+                            DatabaseColumn::Integer(row.try_get(idx).unwrap_or_default())
                         }
                         AnyTypeInfoKind::Real | AnyTypeInfoKind::Double => {
-                            QueryColumn::Float(row.try_get(idx).unwrap_or_default())
-                        }
-                        AnyTypeInfoKind::Text => {
-                            QueryColumn::Text(row.try_get(idx).unwrap_or_default())
-                        }
-                        AnyTypeInfoKind::Blob => {
-                            QueryColumn::Blob(row.try_get(idx).unwrap_or_default())
+                            DatabaseColumn::Float(row.try_get(idx).unwrap_or_default())
                         }
+                        AnyTypeInfoKind::Text => DatabaseColumn::Text(
+                            row.try_get::<String, _>(idx).unwrap_or_default().into(),
+                        ),
+                        AnyTypeInfoKind::Blob => DatabaseColumn::Blob(
+                            row.try_get::<Vec<u8>, _>(idx).unwrap_or_default().into(),
+                        ),
                     });
                 }
                 columns

@@ -204,11 +208,24 @@ impl Directory for SqlDirectory {
 }

 impl SqlDirectory {
-    async fn query_(&self, query: &str, params: &[&str]) -> crate::Result<Option<AnyRow>> {
+    async fn query_(
+        &self,
+        query: &str,
+        params: &[DatabaseColumn<'_>],
+    ) -> crate::Result<Option<AnyRow>> {
         tracing::trace!(context = "directory", event = "query", query = query, params = ?params);
         let mut q = sqlx::query(query);
         for param in params {
-            q = q.bind(param);
+            q = match param {
+                DatabaseColumn::Text(v) => q.bind(v.as_ref()),
+                DatabaseColumn::Integer(v) => q.bind(v),
+                DatabaseColumn::Bool(v) => q.bind(v),
+                DatabaseColumn::Float(v) => q.bind(v),
+                DatabaseColumn::Blob(v) => {
+                    q.bind(std::str::from_utf8(v.as_ref()).unwrap_or_default())
+                }
+                DatabaseColumn::Null => q.bind(""),
+            }
         }

         q.fetch(&self.pool).try_next().await.map_err(Into::into)
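Note the split in parameter handling between backends: the SQL backend above binds each variant as its concrete type (with the caveat, visible in the match, that Blob values are bound as UTF-8 text and degrade to an empty string when not valid UTF-8), whereas the LDAP backend splices parameters into the filter text. A self-contained sketch of that positional '?' expansion, mirroring the LDAP code earlier in this commit:

// Each '?' in the query is replaced by params[i].as_str(); non-Text
// variants expand to "" because as_str() only yields Text contents.
fn expand_query(query: &str, params: &[DatabaseColumn<'_>]) -> String {
    let mut expanded = String::with_capacity(query.len());
    for (pos, item) in query.split('?').enumerate() {
        if pos > 0 {
            if let Some(param) = params.get(pos - 1) {
                expanded.push_str(param.as_str());
            }
        }
        expanded.push_str(item);
    }
    expanded
}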
crates/jmap/Cargo.toml

@@ -37,8 +37,8 @@ p256 = { version = "0.13", features = ["ecdh"] }
 hkdf = "0.12.3"
 sha2 = "0.10.1"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-webpki-roots"]}
-tokio-tungstenite = "0.20.0"
-tungstenite = "0.20.0"
+tokio-tungstenite = "0.20"
+tungstenite = "0.20"
 chrono = "0.4"
 dashmap = "5.4"
 aes = "0.8.3"
crates/nlp/Cargo.toml

@@ -5,6 +5,7 @@ edition = "2021"
 resolver = "2"

 [dependencies]
 utils = { path = "../utils" }
 xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
 farmhash = "1.1.5"
+siphasher = "1.0"

@@ -17,3 +18,12 @@ whatlang = "0.16" # Language detection
 rust-stemmers = "1.2" # Stemmers
 tinysegmenter = "0.1" # Japanese tokenizer
 jieba-rs = "0.6" # Chinese stemmer
+phf = { version = "0.11", features = ["macros"] }
+lru-cache = "0.1.2"
+parking_lot = "0.12.1"
+
+[features]
+test_mode = []
+
+[dev-dependencies]
+tokio = { version = "1.23", features = ["full"] }
crates/nlp/src/bayes/bloom.rs — deleted

@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2023 Stalwart Labs Ltd.
- *
- * This file is part of the Stalwart Mail Server.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- * in the LICENSE file at the top-level directory of this distribution.
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * You can be released from the requirements of the AGPLv3 license by
- * purchasing a commercial license. Please contact licensing@stalw.art
- * for more details.
- */
-
-use nohash::IsEnabled;
-
-use crate::transformers::osb::{Gram, OsbToken};
-
-use super::TokenHash;
-
-pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
-    buf: Vec<u8>,
-    tokens: T,
-}
-
-impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
-    pub fn new(tokens: T) -> Self {
-        Self {
-            buf: Vec::with_capacity(64),
-            tokens,
-        }
-    }
-}
-
-impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
-    type Item = OsbToken<TokenHash>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.tokens.next().map(|token| {
-            let bytes = match token.inner {
-                Gram::Uni { t1 } => t1.as_bytes(),
-                Gram::Bi { t1, t2, .. } => {
-                    self.buf.clear();
-                    self.buf.extend_from_slice(t1.as_bytes());
-                    self.buf.push(b' ');
-                    self.buf.extend_from_slice(t2.as_bytes());
-                    &self.buf
-                }
-            };
-
-            OsbToken {
-                inner: TokenHash {
-                    h1: xxhash_rust::xxh3::xxh3_64(bytes),
-                    h2: farmhash::hash64(bytes),
-                },
-                idx: token.idx,
-            }
-        })
-    }
-}
-
-impl std::hash::Hash for TokenHash {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        state.write_u64(self.h1 ^ self.h2);
-    }
-}
-
-impl IsEnabled for TokenHash {}
crates/nlp/src/bayes/cache.rs — new file, 107 lines

@@ -0,0 +1,107 @@
+/* Copyright (c) 2023 Stalwart Labs Ltd. — AGPL-3.0 license header (identical to the one shown above) */
+
+use std::{
+    hash::BuildHasherDefault,
+    time::{Duration, Instant},
+};
+
+use lru_cache::LruCache;
+use nohash::NoHashHasher;
+use parking_lot::Mutex;
+
+use super::{TokenHash, Weights};
+
+#[derive(Debug)]
+pub struct BayesTokenCache {
+    positive: Mutex<LruCache<TokenHash, CacheItem, BuildHasherDefault<NoHashHasher<TokenHash>>>>,
+    negative: Mutex<LruCache<TokenHash, Instant, BuildHasherDefault<NoHashHasher<TokenHash>>>>,
+    ttl_negative: Duration,
+    ttl_positive: Duration,
+}
+
+#[derive(Debug, Clone)]
+pub struct CacheItem {
+    item: Weights,
+    valid_until: Instant,
+}
+
+impl BayesTokenCache {
+    pub fn new(capacity: usize, ttl_positive: Duration, ttl_negative: Duration) -> Self {
+        Self {
+            positive: Mutex::new(LruCache::with_hasher(capacity, Default::default())),
+            negative: Mutex::new(LruCache::with_hasher(capacity, Default::default())),
+            ttl_negative,
+            ttl_positive,
+        }
+    }
+
+    pub fn get(&self, hash: &TokenHash) -> Option<Option<Weights>> {
+        {
+            let mut pos_cache = self.positive.lock();
+            if let Some(entry) = pos_cache.get_mut(hash) {
+                return if entry.valid_until >= Instant::now() {
+                    Some(Some(entry.item))
+                } else {
+                    pos_cache.remove(hash);
+                    None
+                };
+            }
+        }
+        {
+            let mut neg_cache = self.negative.lock();
+            if let Some(entry) = neg_cache.get_mut(hash) {
+                return if *entry >= Instant::now() {
+                    Some(None)
+                } else {
+                    neg_cache.remove(hash);
+                    None
+                };
+            }
+        }
+
+        None
+    }
+
+    pub fn insert_positive(&self, hash: TokenHash, weights: Weights) {
+        self.positive.lock().insert(
+            hash,
+            CacheItem {
+                item: weights,
+                valid_until: Instant::now() + self.ttl_positive,
+            },
+        );
+    }
+
+    pub fn insert_negative(&self, hash: TokenHash) {
+        self.negative
+            .lock()
+            .insert(hash, Instant::now() + self.ttl_negative);
+    }
+
+    pub fn invalidate(&self, hash: &TokenHash) {
+        if self.positive.lock().remove(hash).is_none() {
+            self.negative.lock().remove(hash);
+        }
+    }
+}
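BayesTokenCache::get deliberately returns three states: Some(Some(w)) is a fresh positive hit, Some(None) is a fresh negative hit (the token is known to be absent from storage), and None means uncached or expired. A hedged sketch of the intended call pattern (lookup_weights and the fetch closure are hypothetical, not part of this commit):

fn lookup_weights(
    cache: &BayesTokenCache,
    hash: TokenHash,
    fetch: impl Fn(&TokenHash) -> Option<Weights>, // backing-store lookup (assumed)
) -> Option<Weights> {
    match cache.get(&hash) {
        // Fresh entry: either cached weights or a cached "not present".
        Some(hit) => hit,
        // Miss or expired: consult storage and repopulate the right side.
        None => match fetch(&hash) {
            Some(w) => {
                cache.insert_positive(hash, w);
                Some(w)
            }
            None => {
                cache.insert_negative(hash);
                None
            }
        },
    }
}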
crates/nlp/src/bayes/classify.rs

@@ -21,13 +21,14 @@
  * for more details.
  */

-use crate::transformers::osb::OsbToken;
+use crate::tokenizers::osb::OsbToken;

 use super::{BayesClassifier, Weights};

+// Position 0 represents Unigram weights
 const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];

 // Credits: ported from RSpamd
 impl BayesClassifier {
     pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
     where
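The index into FEATURE_WEIGHT is the OSB window slot of a token: 0 is the unigram, and 1 through 4 are skip-bigrams whose second word lies that many positions ahead, so adjacent pairs dominate (3125 = 5^5, 256 = 4^4, 27 = 3^3, then 1). A toy illustration of applying the table — not the ported RSpamd math:

const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];

// Scale a per-token score by its OSB window index (illustrative only).
fn weighted(idx: usize, score: f64) -> f64 {
    FEATURE_WEIGHT.get(idx).copied().unwrap_or(0.0) * score
}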
crates/nlp/src/bayes/mod.rs

@@ -26,8 +26,11 @@ use std::{collections::HashMap, hash::BuildHasherDefault};
 use nohash::NoHashHasher;
 use serde::{Deserialize, Serialize};

-pub mod bloom;
+use crate::tokenizers::osb::Gram;
+
+pub mod cache;
 pub mod classify;
+pub mod tokenize;
 pub mod train;

 #[derive(Debug, Serialize, Deserialize, Default)]

@@ -37,7 +40,7 @@ pub struct BayesModel {
     pub ham_learns: u32,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct BayesClassifier {
     pub min_token_hits: u32,
     pub min_tokens: u32,

@@ -47,14 +50,14 @@ pub struct BayesClassifier {

 #[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
 pub struct TokenHash {
-    h1: u64,
-    h2: u64,
+    pub h1: u64,
+    pub h2: u64,
 }

 #[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
 pub struct Weights {
-    spam: u32,
-    ham: u32,
+    pub spam: u32,
+    pub ham: u32,
 }

 impl BayesClassifier {

@@ -73,3 +76,32 @@ impl Default for BayesClassifier {
         Self::new()
     }
 }
+
+impl From<Gram<'_>> for TokenHash {
+    fn from(value: Gram<'_>) -> Self {
+        match value {
+            Gram::Uni { t1 } => TokenHash {
+                h1: xxhash_rust::xxh3::xxh3_64(t1.as_bytes()),
+                h2: farmhash::hash64(t1.as_bytes()),
+            },
+            Gram::Bi { t1, t2, .. } => {
+                let mut buf = Vec::with_capacity(t1.len() + t2.len() + 1);
+                buf.extend_from_slice(t1.as_bytes());
+                buf.push(b' ');
+                buf.extend_from_slice(t2.as_bytes());
+                TokenHash {
+                    h1: xxhash_rust::xxh3::xxh3_64(&buf),
+                    h2: farmhash::hash64(&buf),
+                }
+            }
+        }
+    }
+}
+
+impl std::hash::Hash for TokenHash {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        state.write_u64(self.h1 ^ self.h2);
+    }
+}
+
+impl nohash::IsEnabled for TokenHash {}
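Each n-gram is reduced to two independent 64-bit digests over the same bytes (xxh3 and farmhash), and bigrams hash the two words joined by a single space; the Hash impl then XORs the halves so the nohash identity hasher can treat TokenHash as a pre-hashed key. Equivalent to the bigram arm above (the helper name is illustrative):

fn bigram_hash(t1: &str, t2: &str) -> TokenHash {
    let joined = format!("{t1} {t2}"); // same "t1 SP t2" byte layout as From<Gram>
    TokenHash {
        h1: xxhash_rust::xxh3::xxh3_64(joined.as_bytes()),
        h2: farmhash::hash64(joined.as_bytes()),
    }
}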
crates/nlp/src/bayes/tokenize.rs — new file, 1227 lines (diff suppressed because it is too large)
crates/nlp/src/bayes/train.rs

@@ -21,7 +21,7 @@
  * for more details.
  */

-use crate::transformers::osb::OsbToken;
+use crate::tokenizers::osb::OsbToken;

 use super::{BayesModel, TokenHash};
crates/nlp/src/language/mod.rs

@@ -21,6 +21,10 @@
  * for more details.
  */

+pub mod detect;
+pub mod stemmer;
+pub mod stopwords;
+
 use std::borrow::Cow;

 use crate::tokenizers::{

@@ -29,9 +33,6 @@ use crate::tokenizers::{

 use self::detect::LanguageDetector;

-pub mod detect;
-pub mod stemmer;
-
 pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;

 impl Language {

@@ -131,57 +132,9 @@
 impl Language {
     pub fn from_iso_639(code: &str) -> Option<Self> {
-        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
-            "en" => Language::English,
-            "es" => Language::Spanish,
-            "pt" => Language::Portuguese,
-            "it" => Language::Italian,
-            "fr" => Language::French,
-            "de" => Language::German,
-            "ru" => Language::Russian,
-            "zh" => Language::Mandarin,
-            "ja" => Language::Japanese,
-            "ar" => Language::Arabic,
-            "hi" => Language::Hindi,
-            "ko" => Language::Korean,
-            "bn" => Language::Bengali,
-            "he" => Language::Hebrew,
-            "ur" => Language::Urdu,
-            "fa" => Language::Persian,
-            "ml" => Language::Malayalam,
-            "or" => Language::Oriya,
-            "my" => Language::Burmese,
-            "ne" => Language::Nepali,
-            "si" => Language::Sinhalese,
-            "km" => Language::Khmer,
-            "tk" => Language::Turkmen,
-            "am" => Language::Amharic,
-            "az" => Language::Azerbaijani,
-            "id" => Language::Indonesian,
-            "te" => Language::Telugu,
-            "ta" => Language::Tamil,
-            "vi" => Language::Vietnamese,
-            "gu" => Language::Gujarati,
-            "pa" => Language::Punjabi,
-            "uz" => Language::Uzbek,
-            "hy" => Language::Armenian,
-            "ka" => Language::Georgian,
-            "la" => Language::Latin,
-            "sl" => Language::Slovene,
-            "hr" => Language::Croatian,
-            "sr" => Language::Serbian,
-            "mk" => Language::Macedonian,
-            "lt" => Language::Lithuanian,
-            "lv" => Language::Latvian,
-            "et" => Language::Estonian,
-            "tl" => Language::Tagalog,
-            "af" => Language::Afrikaans,
-            "zu" => Language::Zulu,
-            "sn" => Language::Shona,
-            "ak" => Language::Akan,
-            _ => return None,
-        }
-        .into()
+        LANG_ISO
+            .get(code.split_once('-').map(|c| c.0).unwrap_or(code))
+            .copied()
     }
 }

@@ -200,3 +153,53 @@ impl Language {
         }
     }
 }
+
+static LANG_ISO: phf::Map<&'static str, Language> = phf::phf_map! {
+    "en" => Language::English,
+    "es" => Language::Spanish,
+    "pt" => Language::Portuguese,
+    "it" => Language::Italian,
+    "fr" => Language::French,
+    "de" => Language::German,
+    "ru" => Language::Russian,
+    "zh" => Language::Mandarin,
+    "ja" => Language::Japanese,
+    "ar" => Language::Arabic,
+    "hi" => Language::Hindi,
+    "ko" => Language::Korean,
+    "bn" => Language::Bengali,
+    "he" => Language::Hebrew,
+    "ur" => Language::Urdu,
+    "fa" => Language::Persian,
+    "ml" => Language::Malayalam,
+    "or" => Language::Oriya,
+    "my" => Language::Burmese,
+    "ne" => Language::Nepali,
+    "si" => Language::Sinhalese,
+    "km" => Language::Khmer,
+    "tk" => Language::Turkmen,
+    "am" => Language::Amharic,
+    "az" => Language::Azerbaijani,
+    "id" => Language::Indonesian,
+    "te" => Language::Telugu,
+    "ta" => Language::Tamil,
+    "vi" => Language::Vietnamese,
+    "gu" => Language::Gujarati,
+    "pa" => Language::Punjabi,
+    "uz" => Language::Uzbek,
+    "hy" => Language::Armenian,
+    "ka" => Language::Georgian,
+    "la" => Language::Latin,
+    "sl" => Language::Slovene,
+    "hr" => Language::Croatian,
+    "sr" => Language::Serbian,
+    "mk" => Language::Macedonian,
+    "lt" => Language::Lithuanian,
+    "lv" => Language::Latvian,
+    "et" => Language::Estonian,
+    "tl" => Language::Tagalog,
+    "af" => Language::Afrikaans,
+    "zu" => Language::Zulu,
+    "sn" => Language::Shona,
+    "ak" => Language::Akan,
+};
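The phf map trades the former match statement for a compile-time perfect-hash table; behavior is unchanged, including the stripping of any region subtag before the lookup:

// Behavior sketch for the rewritten from_iso_639.
fn demo() {
    assert!(matches!(Language::from_iso_639("en-US"), Some(Language::English)));
    assert!(matches!(Language::from_iso_639("pt-BR"), Some(Language::Portuguese)));
    assert!(Language::from_iso_639("xx").is_none());
}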
crates/nlp/src/language/stemmer.rs

@@ -70,7 +70,7 @@ impl<'x> Iterator for Stemmer<'x> {
     }
 }

-static STEMMER_MAP: &[Option<Algorithm>] = &[
+pub static STEMMER_MAP: &[Option<Algorithm>] = &[
     None,                     // Esperanto = 0,
     Some(Algorithm::English), // English = 1,
     Some(Algorithm::Russian), // Russian = 2,
crates/nlp/src/language/stopwords.rs — new file, 4192 lines (diff suppressed because it is too large)
crates/nlp/src/lib.rs

@@ -1,59 +1,52 @@
-use ahash::AHashSet;
-
 pub mod bayes;
 pub mod language;
 pub mod tokenizers;
-pub mod transformers;
-
-#[derive(Debug, Clone, Default)]
-pub struct PublicSuffix {
-    pub suffixes: AHashSet<String>,
-    pub exceptions: AHashSet<String>,
-    pub wildcards: Vec<String>,
-}
-
-impl PublicSuffix {
-    pub fn contains(&self, suffix: &str) -> bool {
-        self.suffixes.contains(suffix)
-            || (!self.exceptions.contains(suffix)
-                && self.wildcards.iter().any(|w| suffix.ends_with(w)))
-    }
-}

 #[cfg(test)]
 mod test {
     use std::fs;

+    use utils::suffixlist::PublicSuffix;
+
     use crate::{
-        bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
-        transformers::osb::{OsbToken, OsbTokenizer},
+        bayes::{tokenize::BayesTokenizer, BayesClassifier, BayesModel},
+        tokenizers::osb::{OsbToken, OsbTokenizer},
     };

     #[test]
     #[ignore]
     fn train() {
-        let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
+        let db =
+            fs::read_to_string("/Users/me/code/mail-server/_ignore/spam_or_not_spam.csv").unwrap();
         let mut bayes = BayesModel::default();
+        let suffixes = PublicSuffix::default();

         for line in db.lines() {
             let (text, is_spam) = line.rsplit_once(',').unwrap();
             let is_spam = is_spam == "1";

             bayes.train(
-                BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
+                OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5),
                 is_spam,
             );
         }
         println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
-        fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
+        fs::write(
+            "/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin",
+            bincode::serialize(&bayes).unwrap(),
+        )
+        .unwrap();
     }

     #[test]
     #[ignore]
     fn classify() {
-        let model: BayesModel =
-            bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
+        let model: BayesModel = bincode::deserialize(
+            &fs::read("/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin").unwrap(),
+        )
+        .unwrap();
         let bayes = BayesClassifier::new();
+        let suffixes = PublicSuffix::default();

         for text in [
             "i am attaching to this email a presentation to integrate the spreadsheet into our server",

@@ -65,7 +58,7 @@ mod test {
                 "{:?} -> {}",
                 text,
                 bayes
-                    .classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
+                    .classify(OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5).filter_map(|x| model.weights.get(&x.inner).map(|w| {
                         OsbToken {
                             idx: x.idx,
                             inner: *w,
crates/nlp/src/tokenizers/chinese.rs

@@ -29,7 +29,7 @@ use super::{InnerToken, Token};
 use lazy_static::lazy_static;

 lazy_static! {
-    static ref JIEBA: Jieba = Jieba::new();
+    pub static ref JIEBA: Jieba = Jieba::new();
 }

 pub struct ChineseTokenizer<'x, T, I>
crates/nlp/src/tokenizers/mod.rs

@@ -23,6 +23,7 @@
 pub mod chinese;
 pub mod japanese;
+pub mod osb;
 pub mod space;
 pub mod types;
 pub mod word;
crates/nlp/src/tokenizers/osb.rs — new file, 358 lines

@@ -0,0 +1,358 @@
+/* Copyright (c) 2023 Stalwart Labs Ltd. — AGPL-3.0 license header (identical to the one shown above) */
+
+use std::{borrow::Cow, iter::Peekable};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct OsbToken<T> {
+    pub inner: T,
+    pub idx: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Gram<'x> {
+    Uni { t1: &'x str },
+    Bi { t1: &'x str, t2: &'x str },
+}
+
+pub struct OsbTokenizer<'x, I, R>
+where
+    I: Iterator<Item = Cow<'x, str>>,
+    R: for<'y> From<Gram<'y>> + 'static,
+{
+    iter: Peekable<I>,
+    buf: Vec<Option<Cow<'x, str>>>,
+    window_size: usize,
+    window_pos: usize,
+    window_idx: usize,
+    phantom: std::marker::PhantomData<R>,
+}
+
+impl<'x, I, R> OsbTokenizer<'x, I, R>
+where
+    I: Iterator<Item = Cow<'x, str>>,
+    R: for<'y> From<Gram<'y>> + 'static,
+{
+    pub fn new(iter: I, window_size: usize) -> Self {
+        Self {
+            iter: iter.peekable(),
+            buf: vec![None; window_size],
+            window_pos: 0,
+            window_idx: 0,
+            window_size,
+            phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'x, I, R> Iterator for OsbTokenizer<'x, I, R>
+where
+    I: Iterator<Item = Cow<'x, str>>,
+    R: for<'y> From<Gram<'y>> + 'static,
+{
+    type Item = OsbToken<R>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let end_pos = (self.window_pos + self.window_idx) % self.window_size;
+        if self.buf[end_pos].is_none() {
+            self.buf[end_pos] = self.iter.next();
+        }
+
+        let t1 = self.buf[self.window_pos % self.window_size].as_deref()?;
+        let token = OsbToken {
+            inner: R::from(if self.window_idx != 0 {
+                Gram::Bi {
+                    t1,
+                    t2: self.buf[end_pos].as_deref()?,
+                }
+            } else {
+                Gram::Uni { t1 }
+            }),
+            idx: self.window_idx,
+        };
+
+        // Increment window index
+        self.window_idx += 1;
+        if self.window_idx == self.window_size
+            || (self.iter.peek().is_none()
+                && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
+        {
+            self.buf[self.window_pos % self.window_size] = None;
+            self.window_idx = 0;
+            self.window_pos += 1;
+        }
+
+        Some(token)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow;
+
+    use crate::tokenizers::osb::{Gram, OsbToken};
+
+    impl From<Gram<'_>> for String {
+        fn from(value: Gram<'_>) -> Self {
+            match value {
+                Gram::Uni { t1 } => t1.to_string(),
+                Gram::Bi { t1, t2 } => format!("{t1} {t2}"),
+            }
+        }
+    }
+
+    #[test]
+    fn osb_tokenizer() {
+        // Expected (token, idx) pairs for a window of 5.
+        let expected = [
+            ("The", 0), ("The quick", 1), ("The brown", 2), ("The fox", 3), ("The jumps", 4),
+            ("quick", 0), ("quick brown", 1), ("quick fox", 2), ("quick jumps", 3), ("quick over", 4),
+            ("brown", 0), ("brown fox", 1), ("brown jumps", 2), ("brown over", 3), ("brown the", 4),
+            ("fox", 0), ("fox jumps", 1), ("fox over", 2), ("fox the", 3), ("fox lazy", 4),
+            ("jumps", 0), ("jumps over", 1), ("jumps the", 2), ("jumps lazy", 3), ("jumps dog", 4),
+            ("over", 0), ("over the", 1), ("over lazy", 2), ("over dog", 3), ("over and", 4),
+            ("the", 0), ("the lazy", 1), ("the dog", 2), ("the and", 3), ("the the", 4),
+            ("lazy", 0), ("lazy dog", 1), ("lazy and", 2), ("lazy the", 3), ("lazy lazy", 4),
+            ("dog", 0), ("dog and", 1), ("dog the", 2), ("dog lazy", 3), ("dog cat", 4),
+            ("and", 0), ("and the", 1), ("and lazy", 2), ("and cat", 3),
+            ("the", 0), ("the lazy", 1), ("the cat", 2),
+            ("lazy", 0), ("lazy cat", 1),
+            ("cat", 0),
+        ]
+        .into_iter()
+        .map(|(inner, idx)| OsbToken {
+            inner: inner.to_string(),
+            idx,
+        })
+        .collect::<Vec<_>>();
+
+        assert_eq!(
+            super::OsbTokenizer::new(
+                "The quick brown fox jumps over the lazy dog and the lazy cat"
+                    .split_ascii_whitespace()
+                    .map(Cow::from),
+                5,
+            )
+            .collect::<Vec<_>>(),
+            expected
+        );
+    }
+}
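With a window of 5, every word yields up to five tokens: itself at idx 0 and a skip-bigram with each of the next four words at idx 1 through 4, which is exactly the sequence the test above asserts. Because OsbTokenizer is generic over R: From<Gram>, the same iterator can emit hashes directly; a sketch (production code feeds it a BayesTokenizer rather than whitespace splitting):

use std::borrow::Cow;

fn hashed_tokens(text: &str) -> Vec<OsbToken<TokenHash>> {
    // R = TokenHash is inferred; each uni-/bigram is hashed on the fly
    // via the From<Gram<'_>> impl in crates/nlp/src/bayes/mod.rs.
    OsbTokenizer::new(text.split_ascii_whitespace().map(Cow::from), 5).collect()
}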
@ -23,7 +23,7 @@
|
|||
|
||||
use std::str::CharIndices;
|
||||
|
||||
use crate::PublicSuffix;
|
||||
use utils::suffixlist::PublicSuffix;
|
||||
|
||||
use super::Token;
|
||||
|
||||
|
|
@ -31,35 +31,39 @@ pub struct TypesTokenizer<'x, 'y> {
|
|||
text: &'x str,
|
||||
suffixes: &'y PublicSuffix,
|
||||
iter: CharIndices<'x>,
|
||||
tokens: Vec<Token<TokenType<'x>>>,
|
||||
tokens: Vec<Token<TokenType<&'x str>>>,
|
||||
peek_pos: usize,
|
||||
last_ch_is_space: bool,
|
||||
last_token_is_dot: bool,
|
||||
eof: bool,
|
||||
tokenize_urls: bool,
|
||||
tokenize_urls_without_scheme: bool,
|
||||
tokenize_emails: bool,
|
||||
tokenize_numbers: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TokenType<'x> {
|
||||
Alphabetic(&'x str),
|
||||
Integer(&'x str),
|
||||
Alphanumeric(&'x str),
|
||||
Hexadecimal(&'x str),
|
||||
pub enum TokenType<T> {
|
||||
Alphabetic(T),
|
||||
Integer(T),
|
||||
Alphanumeric(T),
|
||||
Hexadecimal(T),
|
||||
Other(char),
|
||||
Punctuation(char),
|
||||
Space,
|
||||
|
||||
// Detected types
|
||||
Url(&'x str),
|
||||
UrlNoScheme(&'x str),
|
||||
UrlNoHost(&'x str),
|
||||
Email(&'x str),
|
||||
Float(&'x str),
|
||||
Url(T),
|
||||
UrlNoScheme(T),
|
||||
UrlNoHost(T),
|
||||
Email(T),
|
||||
Float(T),
|
||||
}
|
||||
|
||||
impl Copy for Token<TokenType<'_>> {}
|
||||
impl Copy for Token<TokenType<&'_ str>> {}
|
||||
|
||||
impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
|
||||
type Item = Token<TokenType<'x>>;
|
||||
type Item = Token<TokenType<&'x str>>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
        let token = self.peek()?;

@@ -67,7 +71,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
         self.last_token_is_dot = matches!(token.word, TokenType::Punctuation('.'));

         // Try parsing URL with scheme
-        if matches!(
+        if self.tokenize_urls
+            && matches!(
                 token.word,
                 TokenType::Alphabetic(t) | TokenType::Hexadecimal(t)
                     if t.len() <= 8 && t.chars().all(|c| c.is_ascii()))

@@ -82,7 +87,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
         }

         // Try parsing email
-        if token.word.is_email_atom()
+        if self.tokenize_emails
+            && token.word.is_email_atom()
             && self.peek_has_tokens(
                 &[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
                 TokenType::Space,

@@ -97,7 +103,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
         }

         // Try parsing URL without scheme
-        if token.word.is_domain_atom(true)
+        if self.tokenize_urls_without_scheme
+            && token.word.is_domain_atom(true)
             && self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
         {
             if let Some(url) = self.try_parse_url(None) {

@@ -109,7 +116,7 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
         }

         // Try parsing currencies and floating point numbers
-        if !last_is_dot {
+        if self.tokenize_numbers && !last_is_dot {
             if let Some(num) = self.try_parse_number() {
                 self.peek_advance();
                 return Some(num);

@@ -132,9 +139,33 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
             suffixes,
             last_ch_is_space: false,
             last_token_is_dot: false,
+            tokenize_urls: true,
+            tokenize_urls_without_scheme: true,
+            tokenize_emails: true,
+            tokenize_numbers: true,
         }
     }

+    pub fn tokenize_urls(mut self, tokenize: bool) -> Self {
+        self.tokenize_urls = tokenize;
+        self
+    }
+
+    pub fn tokenize_urls_without_scheme(mut self, tokenize: bool) -> Self {
+        self.tokenize_urls_without_scheme = tokenize;
+        self
+    }
+
+    pub fn tokenize_emails(mut self, tokenize: bool) -> Self {
+        self.tokenize_emails = tokenize;
+        self
+    }
+
+    pub fn tokenize_numbers(mut self, tokenize: bool) -> Self {
+        self.tokenize_numbers = tokenize;
+        self
+    }
+
     fn consume(&mut self) -> bool {
         let mut has_alpha = false;
         let mut has_number = false;

@@ -212,7 +243,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         }
     }

-    fn next_(&mut self) -> Option<Token<TokenType<'x>>> {
+    fn next_(&mut self) -> Option<Token<TokenType<&'x str>>> {
         if self.tokens.is_empty() && !self.eof {
             self.consume();
         }

@@ -223,7 +254,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         }
     }

-    fn peek(&mut self) -> Option<Token<TokenType<'x>>> {
+    fn peek(&mut self) -> Option<Token<TokenType<&'x str>>> {
         while self.tokens.len() <= self.peek_pos && !self.eof {
             self.consume();
         }

@@ -244,7 +275,11 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         self.peek_pos = 0;
     }

-    fn peek_has_tokens(&mut self, tokens: &[TokenType<'_>], stop_token: TokenType<'_>) -> bool {
+    fn peek_has_tokens(
+        &mut self,
+        tokens: &[TokenType<&'_ str>],
+        stop_token: TokenType<&'_ str>,
+    ) -> bool {
         let mut tokens = tokens.iter().copied();
         let mut token = tokens.next().unwrap();
         while let Some(t) = self.peek() {

@@ -266,8 +301,8 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
     fn try_parse_url(
         &mut self,
-        scheme_token: Option<Token<TokenType<'_>>>,
-    ) -> Option<Token<TokenType<'x>>> {
+        scheme_token: Option<Token<TokenType<&'_ str>>>,
+    ) -> Option<Token<TokenType<&'x str>>> {
         let (has_scheme, allow_blank_host) = scheme_token.as_ref().map_or((false, false), |t| {
             (
                 true,

@@ -480,7 +515,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         .into()
     }

-    fn try_parse_email(&mut self) -> Option<Token<TokenType<'x>>> {
+    fn try_parse_email(&mut self) -> Option<Token<TokenType<&'x str>>> {
         // Start token is a valid local part atom
         let start_token = self.peek()?;
         let mut last_is_dot = false;

@@ -615,7 +650,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         None
     }

-    fn try_parse_number(&mut self) -> Option<Token<TokenType<'x>>> {
+    fn try_parse_number(&mut self) -> Option<Token<TokenType<&'x str>>> {
         self.peek_rewind();
         let mut start_pos = usize::MAX;
         let mut end_pos = usize::MAX;

@@ -698,7 +733,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> {
         }
     }

-impl<'x> TokenType<'x> {
+impl<T> TokenType<T> {
     fn is_email_atom(&self) -> bool {
         matches!(
             self,

@@ -744,7 +779,8 @@ impl<'x> TokenType<'x> {
 #[cfg(test)]
 mod test {
-    use crate::PublicSuffix;
+    use utils::suffixlist::PublicSuffix;
+
     use super::{TokenType, TypesTokenizer};
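Note (editorial): the four builder methods above make each detector opt-out, with everything enabled by default. A minimal usage sketch, not part of this commit, using only the types shown in this diff (a loaded PublicSuffix is taken as given):

use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use utils::suffixlist::PublicSuffix;

fn emails_only(text: &str, psl: &PublicSuffix) -> Vec<String> {
    // Keep e-mail detection (on by default), switch off URL and number parsing.
    TypesTokenizer::new(text, psl)
        .tokenize_urls(false)
        .tokenize_urls_without_scheme(false)
        .tokenize_numbers(false)
        .filter_map(|t| match t.word {
            TokenType::Email(addr) => Some(addr.to_string()),
            _ => None,
        })
        .collect()
}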
@@ -1,24 +0,0 @@
/*
 * Copyright (c) 2023 Stalwart Labs Ltd.
 *
 * This file is part of the Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */

pub mod osb;
@@ -1,467 +0,0 @@
/*
 * Copyright (c) 2023 Stalwart Labs Ltd.
 *
 * This file is part of the Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */

use std::iter::Peekable;

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OsbToken<T> {
    pub inner: T,
    pub idx: usize,
}

#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Gram<'x> {
    Uni { t1: &'x str },
    Bi { t1: &'x str, t2: &'x str },
}

pub struct OsbTokenizer<'x, I>
where
    I: Iterator<Item = &'x str>,
{
    iter: Peekable<I>,
    buf: Vec<Option<&'x str>>,
    window_size: usize,
    window_pos: usize,
    window_idx: usize,
}

impl<'x, I> OsbTokenizer<'x, I>
where
    I: Iterator<Item = &'x str>,
{
    pub fn new(iter: I, window_size: usize) -> Self {
        Self {
            iter: iter.peekable(),
            buf: vec![None; window_size],
            window_pos: 0,
            window_idx: 0,
            window_size,
        }
    }
}

impl<'x, I> Iterator for OsbTokenizer<'x, I>
where
    I: Iterator<Item = &'x str>,
{
    type Item = OsbToken<Gram<'x>>;

    fn next(&mut self) -> Option<Self::Item> {
        let end_pos = (self.window_pos + self.window_idx) % self.window_size;
        if self.buf[end_pos].is_none() {
            self.buf[end_pos] = self.iter.next();
        }

        let t1 = self.buf[self.window_pos % self.window_size]?;
        let token = OsbToken {
            inner: if self.window_idx != 0 {
                Gram::Bi {
                    t1,
                    t2: self.buf[end_pos]?,
                }
            } else {
                Gram::Uni { t1 }
            },
            idx: self.window_idx,
        };

        // Increment window
        self.window_idx += 1;
        if self.window_idx == self.window_size
            || (self.iter.peek().is_none()
                && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
        {
            self.buf[self.window_pos % self.window_size] = None;
            self.window_idx = 0;
            self.window_pos += 1;
        }

        Some(token)
    }
}

#[cfg(test)]
mod test {
    use crate::transformers::osb::{Gram, OsbToken};

    #[test]
    fn osb_tokenizer() {
        assert_eq!(
            super::OsbTokenizer::new(
                "The quick brown fox jumps over the lazy dog and the lazy cat"
                    .split_ascii_whitespace(),
                5
            )
            .collect::<Vec<_>>(),
            vec![
                OsbToken { inner: Gram::Uni { t1: "The" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "The", t2: "quick" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "The", t2: "brown" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "The", t2: "fox" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "The", t2: "jumps" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "quick" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "quick", t2: "brown" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "quick", t2: "fox" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "quick", t2: "jumps" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "quick", t2: "over" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "brown" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "brown", t2: "fox" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "brown", t2: "jumps" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "brown", t2: "over" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "brown", t2: "the" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "fox" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "fox", t2: "jumps" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "fox", t2: "over" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "fox", t2: "the" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "fox", t2: "lazy" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "jumps" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "jumps", t2: "over" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "jumps", t2: "the" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "jumps", t2: "lazy" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "jumps", t2: "dog" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "over" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "over", t2: "the" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "over", t2: "lazy" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "over", t2: "dog" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "over", t2: "and" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "the" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "lazy" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "dog" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "and" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "the" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "lazy" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "lazy", t2: "dog" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "lazy", t2: "and" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "lazy", t2: "the" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "lazy", t2: "lazy" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "dog" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "dog", t2: "and" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "dog", t2: "the" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "dog", t2: "lazy" }, idx: 3 },
                OsbToken { inner: Gram::Bi { t1: "dog", t2: "cat" }, idx: 4 },
                OsbToken { inner: Gram::Uni { t1: "and" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "and", t2: "the" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "and", t2: "lazy" }, idx: 2 },
                OsbToken { inner: Gram::Bi { t1: "and", t2: "cat" }, idx: 3 },
                OsbToken { inner: Gram::Uni { t1: "the" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "lazy" }, idx: 1 },
                OsbToken { inner: Gram::Bi { t1: "the", t2: "cat" }, idx: 2 },
                OsbToken { inner: Gram::Uni { t1: "lazy" }, idx: 0 },
                OsbToken { inner: Gram::Bi { t1: "lazy", t2: "cat" }, idx: 1 },
                OsbToken { inner: Gram::Uni { t1: "cat" }, idx: 0 }
            ]
        );
    }
}
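Note (editorial): the tokenizer deleted above, which this commit moves into the nlp crate, emits one unigram per word plus one sparse (skip) bigram for each remaining window slot, with idx recording the skip distance. A worked trace consistent with the window arithmetic in next(), for a smaller window than the test uses:

// Editorial trace of OsbTokenizer::new("a b c d".split_ascii_whitespace(), 3):
//   Uni { t1: "a" }            idx 0
//   Bi  { t1: "a", t2: "b" }   idx 1
//   Bi  { t1: "a", t2: "c" }   idx 2   // window full -> slide to "b"
//   Uni { t1: "b" }            idx 0
//   Bi  { t1: "b", t2: "c" }   idx 1
//   Bi  { t1: "b", t2: "d" }   idx 2
//   Uni { t1: "c" }            idx 0
//   Bi  { t1: "c", t2: "d" }   idx 1   // input exhausted -> window shrinks
//   Uni { t1: "d" }            idx 0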
@@ -13,6 +13,7 @@ resolver = "2"

 [dependencies]
 utils = { path = "../utils" }
+nlp = { path = "../nlp" }
 directory = { path = "../directory" }
 mail-auth = { git = "https://github.com/stalwartlabs/mail-auth" }
 mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] }

@@ -50,7 +51,6 @@ num_cpus = "1.15.0"
 lazy_static = "1.4"
 whatlang = "0.16"
 imagesize = "0.12"
-linkify = "0.10"
 idna = "0.4"
 decancer = "1.6.1"
 unicode-security = "0.1.0"
@@ -39,7 +39,7 @@ use std::{
     time::Duration,
 };

-use ahash::{AHashMap, AHashSet};
+use ahash::AHashMap;
 use directory::{Directory, DirectoryConfig, Lookup};
 use mail_auth::{
     common::crypto::{Ed25519Key, RsaKey, Sha256},

@@ -541,13 +541,6 @@ pub enum VerifyStrategy {
     Disable,
 }

-#[derive(Debug, Clone, Default)]
-pub struct PublicSuffix {
-    pub suffixes: AHashSet<String>,
-    pub exceptions: AHashSet<String>,
-    pub wildcards: Vec<String>,
-}
-
 #[derive(Default)]
 pub struct ConfigContext<'x> {
     pub servers: &'x [Server],
@@ -34,9 +34,7 @@ use mail_auth::{
 };

 use crate::{core::Resolvers, outbound::dane::DnssecResolver};
-use utils::config::Config;
-
-use super::PublicSuffix;
+use utils::{config::Config, suffixlist::PublicSuffix};

 pub trait ConfigResolver {
     fn build_resolvers(&self) -> super::Result<Resolvers>;

@@ -108,9 +106,9 @@ impl ConfigResolver for Config {
     }

     fn parse_public_suffix(&self) -> super::Result<PublicSuffix> {
-        let mut ps = PublicSuffix::default();
-
+        let mut has_values = false;
         for (_, value) in self.values("resolver.public-suffix") {
+            has_values = true;
             let bytes = if value.starts_with("https://") || value.starts_with("http://") {
                 match tokio::task::block_in_place(|| {
                     reqwest::blocking::get(value).and_then(|r| {

@@ -175,20 +173,7 @@ impl ConfigResolver for Config {
             match String::from_utf8(bytes) {
                 Ok(list) => {
-                    for line in list.lines() {
-                        let line = line.trim().to_lowercase();
-                        if !line.starts_with("//") {
-                            if let Some(domain) = line.strip_prefix('*') {
-                                ps.wildcards.push(domain.to_string());
-                            } else if let Some(domain) = line.strip_prefix('!') {
-                                ps.exceptions.insert(domain.to_string());
-                            } else {
-                                ps.suffixes.insert(line.to_string());
-                            }
-                        }
-                    }
-
-                    return Ok(ps);
+                    return Ok(PublicSuffix::from(list.as_str()));
                 }
                 Err(err) => {
                     tracing::warn!(

@@ -200,16 +185,10 @@ impl ConfigResolver for Config {
             }
         }

-        tracing::warn!("Failed to parse public suffixes from any source.");
-
-        Ok(ps)
-    }
-}
-
-impl PublicSuffix {
-    pub fn contains(&self, suffix: &str) -> bool {
-        self.suffixes.contains(suffix)
-            || (!self.exceptions.contains(suffix)
-                && self.wildcards.iter().any(|w| suffix.ends_with(w)))
+        if has_values {
+            tracing::warn!("Failed to parse public suffixes from any source.");
+        }
+
+        Ok(PublicSuffix::default())
     }
 }
@@ -21,25 +21,33 @@
  * for more details.
  */

-use std::time::Duration;
+use std::{sync::Arc, time::Duration};

+use directory::Lookup;
+use nlp::bayes::{cache::BayesTokenCache, BayesClassifier};
 use sieve::{compiler::grammar::Capability, Compiler, Runtime};

 use crate::{
     core::{SieveConfig, SieveCore},
     scripts::{functions::register_functions, plugins::RegisterSievePlugins},
 };
-use utils::config::{utils::AsKey, Config};
+use utils::{
+    config::{utils::AsKey, Config},
+    suffixlist::PublicSuffix,
+};

-use super::{resolver::ConfigResolver, ConfigContext, PublicSuffix};
+use super::{resolver::ConfigResolver, ConfigContext};

 pub trait ConfigSieve {
     fn parse_sieve(&self, ctx: &mut ConfigContext) -> super::Result<SieveCore>;
 }

 #[derive(Clone, Default)]
 pub struct SieveContext {
     pub psl: PublicSuffix,
+    pub bayes_classify: BayesClassifier,
+    pub bayes_cache: BayesTokenCache,
+    pub lookup_classify: Arc<Lookup>,
+    pub lookup_train: Arc<Lookup>,
 }

 impl ConfigSieve for Config {

@@ -48,6 +56,29 @@ impl ConfigSieve for Config {
         let mut fnc_map = register_functions().register_plugins();
         let sieve_ctx = SieveContext {
             psl: self.parse_public_suffix()?,
+            bayes_classify: BayesClassifier {
+                min_token_hits: self.property_or_static("bayes.min-token-hits", "2")?,
+                min_tokens: self.property_or_static("bayes.min-tokens", "11")?,
+                min_prob_strength: self.property_or_static("bayes.min-prob-strength", "0.05")?,
+                min_learns: self.property_or_static("bayes.min-learns", "200")?,
+            },
+            bayes_cache: BayesTokenCache::new(
+                self.property_or_static("bayes.cache.capacity", "8192")?,
+                self.property_or_static("bayes.cache.ttl.positive", "1h")?,
+                self.property_or_static("bayes.cache.ttl.negative", "1h")?,
+            ),
+            lookup_classify: ctx
+                .directory
+                .lookups
+                .get("bayes.tokens.classify")
+                .ok_or("No lookup found for key bayes.tokens.classify.".to_string())?
+                .clone(),
+            lookup_train: ctx
+                .directory
+                .lookups
+                .get("bayes.tokens.train")
+                .ok_or("No lookup found for key bayes.tokens.train.".to_string())?
+                .clone(),
         };

         // Allocate compiler and runtime
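Note (editorial): every bayes.* key above is optional; the second argument to property_or_static is the string fallback, so an unconfigured server classifies with the thresholds below. A sketch, not part of the commit; the field names come from this diff, while the concrete integer types are an assumption:

use nlp::bayes::BayesClassifier;

fn default_classifier() -> BayesClassifier {
    // Effective values when no bayes.* keys are configured (assumed types).
    BayesClassifier {
        min_token_hits: 2,       // bayes.min-token-hits
        min_tokens: 11,          // bayes.min-tokens
        min_prob_strength: 0.05, // bayes.min-prob-strength
        min_learns: 200,         // bayes.min-learns
    }
}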
@@ -24,7 +24,6 @@
 use core::panic;
 use std::{sync::Arc, time::Duration};

-use ahash::AHashMap;
 use directory::Lookup;
 use mail_auth::common::headers::HeaderWriter;
 use sieve::{

@@ -68,8 +67,6 @@ impl SMTP {
         let mut modifications = vec![];
         let mut keep_id = usize::MAX;

-        let mut plugin_data = AHashMap::new();
-
         // Start event loop
         while let Some(result) = instance.run(input) {
             match result {

@@ -125,7 +122,6 @@ impl SMTP {
                         span: &span,
                         handle: &handle,
                         core: self,
-                        data: &mut plugin_data,
                         message: instance.message(),
                         arguments,
                     },
@@ -21,11 +21,12 @@
  * for more details.
  */

+use nlp::tokenizers::types::{TokenType, TypesTokenizer};
 use sieve::{runtime::Variable, Context};

-use crate::{config::scripts::SieveContext, scripts::functions::url::tokenize_email};
+use crate::config::scripts::SieveContext;

-use super::{html::html_to_tokens, url::tokenize_url, ApplyString};
+use super::{html::html_to_tokens, ApplyString};

 pub fn fn_trim<'x>(_: &'x Context<'x, SieveContext>, v: Vec<Variable<'x>>) -> Variable<'x> {
     v[0].transform(|s| Variable::StringRef(s.trim()))

@@ -106,13 +107,49 @@ pub fn fn_tokenize<'x>(
     ctx: &'x Context<'x, SieveContext>,
     mut v: Vec<Variable<'x>>,
 ) -> Variable<'x> {
-    match v[1].to_cow().as_ref() {
-        "html" => html_to_tokens(v[0].to_cow().as_ref()).into(),
-        "words" => tokenize_words(&v[0]),
-        "uri" | "url" => tokenize_url(ctx, v.remove(0), false),
-        "uri_strict" | "url_strict" => tokenize_url(ctx, v.remove(0), true),
-        "email" => tokenize_email(v.remove(0)),
-        _ => Variable::default(),
+    let (urls, urls_without_scheme, emails) = match v[1].to_cow().as_ref() {
+        "html" => return html_to_tokens(v[0].to_cow().as_ref()).into(),
+        "words" => return tokenize_words(&v[0]),
+        "uri" | "url" => (true, true, true),
+        "uri_strict" | "url_strict" => (true, false, false),
+        "email" => (false, false, true),
+        _ => return Variable::default(),
+    };
+
+    match v.remove(0) {
+        Variable::StringRef(text) => TypesTokenizer::new(text, &ctx.context().psl)
+            .tokenize_numbers(false)
+            .tokenize_urls(urls)
+            .tokenize_urls_without_scheme(urls_without_scheme)
+            .tokenize_emails(emails)
+            .filter_map(|t| match t.word {
+                TokenType::Url(text) if urls => Variable::StringRef(text).into(),
+                TokenType::UrlNoScheme(text) if urls_without_scheme => {
+                    Variable::String(format!("https://{text}")).into()
+                }
+                TokenType::Email(text) if emails => Variable::StringRef(text).into(),
+                _ => None,
+            })
+            .collect::<Vec<_>>()
+            .into(),
+        v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
+            TypesTokenizer::new(v.to_cow().as_ref(), &ctx.context().psl)
+                .tokenize_numbers(false)
+                .tokenize_urls(urls)
+                .tokenize_urls_without_scheme(urls_without_scheme)
+                .tokenize_emails(emails)
+                .filter_map(|t| match t.word {
+                    TokenType::Url(text) if urls => Variable::String(text.to_string()).into(),
+                    TokenType::UrlNoScheme(text) if urls_without_scheme => {
+                        Variable::String(format!("https://{text}")).into()
+                    }
+                    TokenType::Email(text) if emails => Variable::String(text.to_string()).into(),
+                    _ => None,
+                })
+                .collect::<Vec<_>>()
+                .into()
+        }
+        v => v,
+    }
 }
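Note (editorial): a single TypesTokenizer pass now replaces the linkify-based scanners, and URLs found without a scheme are normalized with an https:// prefix. A sketch of the expected mapping under each mode (hypothetical input; assumes the relevant TLDs are in the configured PSL):

// fn_tokenize(text, mode) -- editorial sketch of the dispatch above:
//   text = "see https://stalw.art, www.example.org, or mail user@example.org"
//   "url"        -> ["https://stalw.art", "https://www.example.org", "user@example.org"]
//   "url_strict" -> ["https://stalw.art"]     // scheme required, no bare hosts or e-mails
//   "email"      -> ["user@example.org"]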
@@ -21,94 +21,13 @@
  * for more details.
  */

-use std::net::IpAddr;
-
 use hyper::Uri;
-use linkify::LinkKind;
 use sieve::{runtime::Variable, Context};

 use crate::config::scripts::SieveContext;

 use super::ApplyString;

-pub fn tokenize_url<'x>(
-    ctx: &'x Context<'x, SieveContext>,
-    v: Variable<'x>,
-    must_have_scheme: bool,
-) -> Variable<'x> {
-    match v {
-        Variable::StringRef(text) => linkify::LinkFinder::new()
-            .url_must_have_scheme(must_have_scheme)
-            .kinds(&[LinkKind::Url])
-            .links(text.as_ref())
-            .filter_map(|url| filter_url(url.as_str(), must_have_scheme, ctx))
-            .collect::<Vec<_>>()
-            .into(),
-        v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
-            linkify::LinkFinder::new()
-                .url_must_have_scheme(must_have_scheme)
-                .kinds(&[LinkKind::Url])
-                .links(v.to_cow().as_ref())
-                .filter_map(|url| {
-                    filter_url(url.as_str(), must_have_scheme, ctx).map(|v| v.into_owned())
-                })
-                .collect::<Vec<_>>()
-                .into()
-        }
-        v => v,
-    }
-}
-
-pub fn tokenize_email(v: Variable<'_>) -> Variable<'_> {
-    match v {
-        Variable::StringRef(text) => linkify::LinkFinder::new()
-            .email_domain_must_have_dot(true)
-            .kinds(&[LinkKind::Email])
-            .links(text.as_ref())
-            .map(|email| Variable::StringRef(email.as_str()))
-            .collect::<Vec<_>>()
-            .into(),
-        v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => {
-            linkify::LinkFinder::new()
-                .email_domain_must_have_dot(true)
-                .kinds(&[LinkKind::Email])
-                .links(v.to_cow().as_ref())
-                .map(|email| Variable::String(email.as_str().to_string()))
-                .collect::<Vec<_>>()
-                .into()
-        }
-        v => v,
-    }
-}
-
-fn filter_url<'x, 'y>(
-    url: &'x str,
-    must_have_scheme: bool,
-    ctx: &'y Context<'y, SieveContext>,
-) -> Option<Variable<'x>> {
-    if must_have_scheme || url.contains("://") {
-        Some(Variable::StringRef(url))
-    } else {
-        // Filter out possible URLs without a valid TLD
-        let host = url.split_once('/').map_or(url, |(f, _)| f);
-        if (host
-            .as_bytes()
-            .first()
-            .map_or(true, |ch| ch.is_ascii_hexdigit())
-            && host.parse::<IpAddr>().is_ok())
-            || ctx
-                .context()
-                .psl
-                .contains(host.rsplit_once('.').map_or(host, |(_, tld)| tld))
-            || host.ends_with(".onion")
-        {
-            Some(Variable::String(format!("https://{url}")))
-        } else {
-            None
-        }
-    }
-}
-
 pub fn fn_uri_part<'x>(_: &'x Context<'x, SieveContext>, v: Vec<Variable<'x>>) -> Variable<'x> {
     let part = v[1].to_cow();
     v[0].transform(|uri| {
crates/smtp/src/scripts/plugins/bayes.rs (new file, 206 lines)

@@ -0,0 +1,206 @@
/*
 * Copyright (c) 2023 Stalwart Labs Ltd.
 *
 * This file is part of Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */

use directory::{DatabaseColumn, Lookup};
use nlp::{
    bayes::{cache::BayesTokenCache, tokenize::BayesTokenizer, BayesModel, TokenHash, Weights},
    tokenizers::osb::{OsbToken, OsbTokenizer},
};
use sieve::{runtime::Variable, FunctionMap};
use tokio::runtime::Handle;

use crate::config::scripts::SieveContext;

use super::PluginContext;

pub fn register_train(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
    fnc_map.set_external_function("bayes_train", plugin_id, 2);
}

pub fn register_untrain(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
    fnc_map.set_external_function("bayes_untrain", plugin_id, 2);
}

pub fn register_classify(plugin_id: u32, fnc_map: &mut FunctionMap<SieveContext>) {
    fnc_map.set_external_function("bayes_classify", plugin_id, 1);
}

pub fn exec_train(ctx: PluginContext<'_>) -> Variable<'static> {
    train(ctx, true)
}

pub fn exec_untrain(ctx: PluginContext<'_>) -> Variable<'static> {
    train(ctx, false)
}

fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable<'static> {
    let mut arguments = ctx.arguments.into_iter();
    let text = arguments.next().unwrap().into_string();
    if text.is_empty() {
        return false.into();
    }
    let handle = ctx.handle;
    let ctx = ctx.core.sieve.runtime.context();

    // Train the model
    let is_spam = arguments.next().unwrap().to_bool();
    let mut model = BayesModel::default();
    model.train(
        OsbTokenizer::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5),
        is_spam,
    );
    if model.weights.is_empty() {
        return false.into();
    }

    // Update weight and invalidate cache
    let upsert = &ctx.lookup_train;
    for (hash, weights) in model.weights {
        let (s_weight, h_weight) = if is_train {
            (weights.spam as i64, weights.ham as i64)
        } else {
            (-(weights.spam as i64), -(weights.ham as i64))
        };
        if handle
            .block_on(upsert.lookup(&[
                hash.h1.into(),
                hash.h2.into(),
                s_weight.into(),
                h_weight.into(),
            ]))
            .is_none()
        {
            return false.into();
        }
        ctx.bayes_cache.invalidate(&hash);
    }

    // Update training counts
    let train_val = if is_train { 1i64 } else { -1i64 };
    let (spam_count, ham_count) = if is_spam {
        (train_val, 0i64)
    } else {
        (0i64, train_val)
    };
    if handle
        .block_on(upsert.query(&[
            0i64.into(),
            0i64.into(),
            spam_count.into(),
            ham_count.into(),
        ]))
        .is_none()
    {
        return false.into();
    }
    ctx.bayes_cache.invalidate(&TokenHash::default());

    true.into()
}

pub fn exec_classify(ctx: PluginContext<'_>) -> Variable<'static> {
    let mut arguments = ctx.arguments.into_iter();
    let text = arguments.next().unwrap().into_string();
    if text.is_empty() {
        return 0.into();
    }
    let handle = ctx.handle;
    let ctx = ctx.core.sieve.runtime.context();
    let get_token = &ctx.lookup_classify;

    // Obtain training counts
    let (spam_learns, ham_learns) = if let Some(weights) =
        ctx.bayes_cache
            .get_or_update(TokenHash::default(), handle, get_token)
    {
        (weights.spam, weights.ham)
    } else {
        return 0.into();
    };

    // Make sure we have enough training data
    if spam_learns < ctx.bayes_classify.min_learns || ham_learns < ctx.bayes_classify.min_learns {
        return 0.into();
    }

    // Classify the text
    ctx.bayes_classify
        .classify(
            OsbTokenizer::<_, TokenHash>::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5)
                .filter_map(|t| {
                    OsbToken {
                        inner: ctx.bayes_cache.get_or_update(t.inner, handle, get_token)?,
                        idx: t.idx,
                    }
                    .into()
                }),
            ham_learns,
            spam_learns,
        )
        .unwrap_or_default()
        .into()
}

trait LookupOrInsert {
    fn get_or_update(
        &self,
        hash: TokenHash,
        handle: &Handle,
        get_token: &Lookup,
    ) -> Option<Weights>;
}

impl LookupOrInsert for BayesTokenCache {
    fn get_or_update(
        &self,
        hash: TokenHash,
        handle: &Handle,
        get_token: &Lookup,
    ) -> Option<Weights> {
        if let Some(weights) = self.get(&hash) {
            weights.unwrap_or_default().into()
        } else if let Some(result) =
            handle.block_on(get_token.query(&[hash.h1.into(), hash.h2.into()]))
        {
            let mut result = result.into_iter();
            match (result.next(), result.next()) {
                (Some(DatabaseColumn::Integer(spam)), Some(DatabaseColumn::Integer(ham))) => {
                    let weights = Weights {
                        spam: spam as u32,
                        ham: ham as u32,
                    };
                    self.insert_positive(hash, weights);
                    weights
                }
                _ => {
                    self.insert_negative(hash);
                    Weights::default()
                }
            }
            .into()
        } else {
            // Something went wrong
            None
        }
    }
}
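Note (editorial): bayes_train and bayes_classify share one pipeline: BayesTokenizer splits and normalizes words against the PSL, OsbTokenizer (window 5) produces the sparse bigrams, and each gram is reduced to a TokenHash whose Weights live behind the two configured lookups. A standalone sketch of the in-memory half of that flow, using only the nlp types shown above (the PSL value is assumed loaded elsewhere; sample texts are hypothetical):

use nlp::{
    bayes::{tokenize::BayesTokenizer, BayesModel},
    tokenizers::osb::OsbTokenizer,
};
use utils::suffixlist::PublicSuffix;

fn train_samples(psl: &PublicSuffix) -> BayesModel {
    let mut model = BayesModel::default();
    // One spam and one ham sample; the bool mirrors bayes_train's second argument.
    model.train(
        OsbTokenizer::new(BayesTokenizer::new("win a free prize now", psl), 5),
        true,
    );
    model.train(
        OsbTokenizer::new(BayesTokenizer::new("meeting notes attached", psl), 5),
        false,
    );
    // The plugin then persists model.weights per TokenHash, as train() does above.
    model
}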
@@ -21,6 +21,7 @@
  * for more details.
  */

+use directory::DatabaseColumn;
 use sieve::{runtime::Variable, FunctionMap};

 use crate::config::scripts::SieveContext;

@@ -62,15 +63,20 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
 }

 pub fn exec_map(ctx: PluginContext<'_>) -> Variable<'static> {
-    let lookup_id = ctx.arguments[0].to_cow();
-    let item = ctx.arguments[1].to_cow();
+    let mut arguments = ctx.arguments.into_iter();
+    let lookup_id = arguments.next().unwrap().into_cow();
+    let items = match arguments.next().unwrap() {
+        Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(),
+        Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(),
+        v => vec![DatabaseColumn::from(v)],
+    };
     let span = ctx.span;

-    if !lookup_id.is_empty() && !item.is_empty() {
+    if !lookup_id.is_empty() && !items.is_empty() {
         if let Some(lookup) = ctx.core.sieve.lookup.get(lookup_id.as_ref()) {
             return ctx
                 .handle
-                .block_on(lookup.lookup(item.as_ref()))
+                .block_on(lookup.lookup(&items))
                 .unwrap_or_default();
         } else {
             tracing::warn!(
@@ -21,13 +21,13 @@
  * for more details.
  */

+pub mod bayes;
 pub mod dns;
 pub mod exec;
 pub mod http;
 pub mod lookup;
 pub mod query;

-use ahash::AHashMap;
 use mail_parser::Message;
 use sieve::{runtime::Variable, FunctionMap, Input};
 use tokio::runtime::Handle;

@@ -41,12 +41,11 @@ pub struct PluginContext<'x> {
     pub span: &'x tracing::Span,
     pub handle: &'x Handle,
     pub core: &'x SMTP,
-    pub data: &'x mut AHashMap<String, String>,
     pub message: &'x Message<'x>,
     pub arguments: Vec<Variable<'static>>,
 }

-const PLUGINS_EXEC: [ExecPluginFnc; 7] = [
+const PLUGINS_EXEC: [ExecPluginFnc; 10] = [
     query::exec,
     exec::exec,
     lookup::exec,

@@ -54,8 +53,11 @@ const PLUGINS_EXEC: [ExecPluginFnc; 7] = [
     dns::exec,
     dns::exec_exists,
     http::exec_header,
+    bayes::exec_train,
+    bayes::exec_untrain,
+    bayes::exec_classify,
 ];
-const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [
+const PLUGINS_REGISTER: [RegisterPluginFnc; 10] = [
     query::register,
     exec::register,
     lookup::register,

@@ -63,6 +65,9 @@ const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [
     dns::register,
     dns::register_exists,
     http::register_header,
+    bayes::register_train,
+    bayes::register_untrain,
+    bayes::register_classify,
 ];

 pub trait RegisterSievePlugins {
@@ -22,7 +22,7 @@
  */

 use crate::config::scripts::SieveContext;
-use directory::QueryColumn;
+use directory::DatabaseColumn;
 use sieve::{runtime::Variable, FunctionMap};

 use super::PluginContext;

@@ -62,8 +62,12 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
         return false.into();
     }

-    // Obtain parameters
-    let parameters = arguments.next().unwrap().into_string_array();
+    // Obtain arguments
+    let arguments = match arguments.next().unwrap() {
+        Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(),
+        Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(),
+        v => vec![DatabaseColumn::from(v)],
+    };

     // Run query
     if query

@@ -71,12 +75,9 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
         .get(..6)
         .map_or(false, |q| q.eq_ignore_ascii_case(b"SELECT"))
     {
-        if let Ok(mut query_columns) = ctx.handle.block_on(directory.query(
-            &query,
-            &parameters.iter().map(String::as_str).collect::<Vec<_>>(),
-        )) {
+        if let Ok(mut query_columns) = ctx.handle.block_on(directory.query(&query, &arguments)) {
             match query_columns.len() {
-                1 if !matches!(query_columns.first(), Some(QueryColumn::Null)) => {
+                1 if !matches!(query_columns.first(), Some(DatabaseColumn::Null)) => {
                     query_columns.pop().map(Variable::from).unwrap()
                 }
                 0 => Variable::default(),

@@ -87,10 +88,7 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> {
         }
     } else {
         ctx.handle
-            .block_on(directory.lookup(
-                &query,
-                &parameters.iter().map(String::as_str).collect::<Vec<_>>(),
-            ))
+            .block_on(directory.lookup(&query, &arguments))
             .is_ok()
            .into()
     }
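Note (editorial): exec now passes typed DatabaseColumn arguments straight through instead of stringly parameters, and the first six bytes of the statement still pick the execution path. A self-contained sketch of that dispatch test (the SQL strings are hypothetical):

// Editorial sketch of the SELECT check above.
fn is_select(query: &str) -> bool {
    query
        .as_bytes()
        .get(..6)
        .map_or(false, |q| q.eq_ignore_ascii_case(b"SELECT"))
}

fn main() {
    assert!(is_select("SELECT spam, ham FROM bayes_tokens WHERE h1 = ? AND h2 = ?"));
    assert!(!is_select("INSERT ...")); // non-SELECT statements run via lookup() instead
}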
@@ -298,6 +298,18 @@ impl ParseValue for u64 {
     }
 }

+impl ParseValue for f64 {
+    fn parse_value(key: impl AsKey, value: &str) -> super::Result<Self> {
+        value.parse().map_err(|_| {
+            format!(
+                "Invalid floating point value {:?} for property {:?}.",
+                value,
+                key.as_key()
+            )
+        })
+    }
+}
+
 impl ParseValue for u16 {
     fn parse_value(key: impl AsKey, value: &str) -> super::Result<Self> {
         value.parse().map_err(|_| {
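Note (editorial): this impl is what lets float-valued properties such as bayes.min-prob-strength parse at all. A standalone mirror of its behavior (ParseValue and AsKey are the crate's traits; this sketch inlines them as plain &str):

fn parse_f64(key: &str, value: &str) -> Result<f64, String> {
    value
        .parse()
        .map_err(|_| format!("Invalid floating point value {:?} for property {:?}.", value, key))
}

fn main() {
    assert_eq!(parse_f64("bayes.min-prob-strength", "0.05"), Ok(0.05));
    assert!(parse_f64("bayes.min-prob-strength", "not-a-number").is_err());
}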
@@ -30,6 +30,7 @@ pub mod config;
 pub mod ipc;
 pub mod listener;
 pub mod map;
+pub mod suffixlist;

 use opentelemetry::{
     sdk::{
crates/utils/src/suffixlist.rs (new file, 59 lines)

@@ -0,0 +1,59 @@
/*
 * Copyright (c) 2023 Stalwart Labs Ltd.
 *
 * This file is part of Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */

use ahash::AHashSet;

#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
    pub suffixes: AHashSet<String>,
    pub exceptions: AHashSet<String>,
    pub wildcards: Vec<String>,
}

impl PublicSuffix {
    pub fn contains(&self, suffix: &str) -> bool {
        self.suffixes.contains(suffix)
            || (!self.exceptions.contains(suffix)
                && self.wildcards.iter().any(|w| suffix.ends_with(w)))
    }
}

impl From<&str> for PublicSuffix {
    fn from(list: &str) -> Self {
        let mut ps = PublicSuffix::default();
        for line in list.lines() {
            let line = line.trim().to_lowercase();
            if !line.starts_with("//") {
                if let Some(domain) = line.strip_prefix('*') {
                    ps.wildcards.push(domain.to_string());
                } else if let Some(domain) = line.strip_prefix('!') {
                    ps.exceptions.insert(domain.to_string());
                } else {
                    ps.suffixes.insert(line.to_string());
                }
            }
        }
        ps.suffixes.insert("onion".to_string());
        ps
    }
}
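Note (editorial): From<&str> accepts the standard public suffix list syntax (// comments, * wildcards, ! exceptions) and unconditionally seeds "onion", mirroring the .onion special case that the deleted filter_url handled explicitly. A usage sketch with a tiny hypothetical excerpt:

use utils::suffixlist::PublicSuffix;

fn main() {
    let psl = PublicSuffix::from("// comments are skipped\ncom\n*.ck\n!www.ck");
    assert!(psl.contains("com"));
    assert!(psl.contains("onion"));       // inserted unconditionally by From<&str>
    assert!(psl.contains("anything.ck")); // matches the *.ck wildcard
    assert!(!psl.contains("www.ck"));     // excluded by the !www.ck exception
}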
@@ -1,6 +1,5 @@
 # Mailing list scores
 let "ml_score" "count(header.List-Id:List-Archive:List-Owner:List-Help:List-Post:X-Loop:List-Subscribe:List-Unsubscribe[*].exists) * 0.125";
-eval "print('ml_score: ' + ml_score)";
 if eval "ml_score < 1" {
     if eval "header.List-Id.exists" {
         let "ml_score" "ml_score + 0.50";