diff --git a/crates/common/src/scripts/mod.rs b/crates/common/src/scripts/mod.rs
index 2aec6814..01d16f9b 100644
--- a/crates/common/src/scripts/mod.rs
+++ b/crates/common/src/scripts/mod.rs
@@ -8,7 +8,7 @@ use crate::IntoString;
 pub mod functions;
 pub mod plugins;
 
-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptModification {
     SetEnvelope {
         name: Envelope,
diff --git a/crates/common/src/scripts/plugins/bayes.rs b/crates/common/src/scripts/plugins/bayes.rs
index d96285d4..3ad583a9 100644
--- a/crates/common/src/scripts/plugins/bayes.rs
+++ b/crates/common/src/scripts/plugins/bayes.rs
@@ -79,9 +79,15 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
     let text = ctx.arguments[1].to_string();
     let is_spam = ctx.arguments[2].to_bool();
     if text.is_empty() {
+        tracing::debug!(
+            parent: span,
+            context = "sieve:bayes_train",
+            event = "failed",
+            reason = "Empty message",
+        );
         return false.into();
     }
 
     // Train the model
     let mut model = BayesModel::default();
     model.train(
@@ -92,6 +98,12 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
         is_spam,
     );
     if model.weights.is_empty() {
+        tracing::debug!(
+            parent: span,
+            context = "sieve:bayes_train",
+            event = "failed",
+            reason = "No weights found",
+        );
         return false.into();
     }
 
diff --git a/crates/jmap/src/api/management/mod.rs b/crates/jmap/src/api/management/mod.rs
index 463b7507..684b5ea0 100644
--- a/crates/jmap/src/api/management/mod.rs
+++ b/crates/jmap/src/api/management/mod.rs
@@ -29,6 +29,7 @@ pub mod queue;
 pub mod reload;
 pub mod report;
 pub mod settings;
+pub mod sieve;
 pub mod stores;
 
 use std::{borrow::Cow, sync::Arc};
@@ -89,6 +90,7 @@ impl JMAP {
             "logs" if is_superuser && req.method() == Method::GET => {
                 self.handle_view_logs(req).await
             }
+            "sieve" if is_superuser => self.handle_run_sieve(req, path, body).await,
             "restart" if is_superuser && req.method() == Method::GET => {
ManagementApiError::Unsupported { details: "Restart is not yet supported".into(),
diff --git a/crates/jmap/src/api/management/sieve.rs b/crates/jmap/src/api/management/sieve.rs
new file mode 100644
index 00000000..9e6f0c60
--- /dev/null
+++ b/crates/jmap/src/api/management/sieve.rs
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::time::SystemTime;
+
+use hyper::Method;
+use jmap_proto::error::request::RequestError;
+use serde_json::json;
+use sieve::{runtime::Variable, Envelope};
+use smtp::scripts::ScriptParameters;
+use utils::url_params::UrlParams;
+
+use crate::{
+    api::{http::ToHttpResponse, HttpRequest, HttpResponse, JsonResponse},
+    JMAP,
+};
+
+impl JMAP {
+    /// Runs a configured Sieve script on demand and returns its result as JSON.
+    ///
+    /// The script is looked up by `path[1]` in `self.core.sieve.scripts`. A missing
+    /// path segment, an unknown script name, or any method other than `POST`
+    /// produces the "not found" error response.
+    ///
+    /// The request body (empty when absent) is handed to the script as the message.
+    /// Query parameters are mapped as follows:
+    ///
+    /// * `env_to` is lowercased and collected into the `To` envelope list.
+    /// * The other `env_*` keys matched below set the corresponding envelope value
+    ///   as given.
+    /// * Any other key is passed to the script as a variable of that name.
+    ///
+    /// Two variables are set before the query string is read: `now` (seconds since
+    /// the Unix epoch, `0` if the system clock is before it) and `test` (`true`).
+    ///
+    /// The response body is the JSON object `{"data": <script result>}`.
+    pub async fn handle_run_sieve(
+        &self,
+        req: &HttpRequest,
+        path: Vec<&str>,
+        body: Option<Vec<u8>>,
+    ) -> HttpResponse {
+        // Resolve the script and require POST in one step; any mismatch is
+        // answered with "not found".
+        let script = match (
+            path.get(1)
+                .and_then(|name| self.core.sieve.scripts.get(*name))
+                .cloned(),
+            req.method(),
+        ) {
+            (Some(script), &Method::POST) => script,
+            _ => {
+                return RequestError::not_found().into_http_response();
+            }
+        };
+
+        // NOTE(review): these defaults are set first, so a query parameter named
+        // `now` or `test` presumably replaces them -- confirm against `set_variable`.
+        let mut params = ScriptParameters::new()
+            .set_variable(
+                "now",
+                SystemTime::now()
+                    .duration_since(SystemTime::UNIX_EPOCH)
+                    .map_or(0, |d| d.as_secs()),
+            )
+            .set_variable("test", true)
+            .with_message(body.as_deref().unwrap_or_default());
+
+        // NOTE(review): the parameters come from a map, so a repeated `env_to` key
+        // presumably yields one entry -- confirm against `UrlParams::new`.
+        let mut envelope_to = Vec::new();
+        for (key, value) in UrlParams::new(req.uri().query()).into_inner() {
+            let env = match key.as_ref() {
+                "env_to" => {
+                    envelope_to.push(Variable::from(value.to_lowercase()));
+                    continue;
+                }
+                "env_from" => Envelope::From,
+                "env_orcpt" => Envelope::Orcpt,
+                "env_ret" => Envelope::Ret,
+                "env_notify" => Envelope::Notify,
+                "env_id" => Envelope::Envid,
+                "env_bym" => Envelope::ByMode,
+                "env_byt" => Envelope::ByTrace,
+                "env_byta" => Envelope::ByTimeAbsolute,
+                "env_bytr" => Envelope::ByTimeRelative,
+                _ => {
+                    // Not an envelope key: expose it to the script as a variable.
+                    params = params.set_variable(key.into_owned(), value.into_owned());
+                    continue;
+                }
+            };
+
+            params = params.set_envelope(env, value);
+        }
+
+        // Recipients are set once, as a single list value.
+        if !envelope_to.is_empty() {
+            params = params.set_envelope(Envelope::To, Variable::from(envelope_to));
+        }
+
+        // Run script
+        let result = self
+            .smtp
+            .run_script(script, params, tracing::debug_span!("sieve_manual_run"))
+            .await;
+
+        JsonResponse::new(json!({
+            "data": result,
+        }))
+        .into_http_response()
+    }
+}
diff --git a/crates/smtp/src/scripts/event_loop.rs b/crates/smtp/src/scripts/event_loop.rs
index 218b612b..2f680727 100644
--- a/crates/smtp/src/scripts/event_loop.rs
+++ b/crates/smtp/src/scripts/event_loop.rs
@@ -50,7 +50,7 @@ impl SMTP {
             .core
             .sieve
             .trusted_runtime
-            .filter(params.message.as_ref().map_or(b"", |m| &m[..]))
+            .filter(params.message.unwrap_or_default())
             .with_vars_env(params.variables)
             .with_envelope_list(params.envelope)
             .with_user_address(&params.from_addr)
diff --git a/crates/smtp/src/scripts/mod.rs b/crates/smtp/src/scripts/mod.rs
index 63d9598a..b744a34a 100644
--- a/crates/smtp/src/scripts/mod.rs
+++ b/crates/smtp/src/scripts/mod.rs
@@ -31,7 +31,7 @@ pub mod envelope;
 pub mod event_loop;
 pub mod exec;
 
-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptResult {
     Accept {
         modifications: Vec<ScriptModification>,
@@ -112,6 +112,14 @@ impl<'x> ScriptParameters<'x> {
         self
     }
 
+    /// Appends `(envelope, value)` to the envelope list and returns `self` for chaining.
+    ///
+    /// Entries are pushed, not replaced: setting the same envelope part twice keeps both.
+    pub fn set_envelope(mut self, envelope: Envelope, value: impl Into<Variable>) -> Self {
+        self.envelope.push((envelope, value.into()));
+        self
+    }
+
     #[cfg(feature = "test_mode")]
     pub fn with_expected_variables(
         mut self,
diff --git a/crates/utils/src/url_params.rs b/crates/utils/src/url_params.rs
index e9f78101..6588914d 100644
--- a/crates/utils/src/url_params.rs
+++ b/crates/utils/src/url_params.rs
@@ -55,4 +55,9 @@ impl<'x> UrlParams<'x> {
     {
         self.get(key).and_then(|v| v.parse().ok())
     }
+
+    /// Consumes `self` and returns the underlying parameter map.
+    pub fn into_inner(self) -> HashMap<Cow<'x, str>, Cow<'x, str>> {
+        self.params
+    }
 }
diff --git a/resources/config/build.py b/resources/config/build.py
index 5af133b2..7bb9ed0b 100644
--- a/resources/config/build.py
+++ b/resources/config/build.py
@@ -37,12 +37,17 @@ scripts = {
     "greylist": [
         "config.sieve",
         "greylist.sieve"
+    ],
+    "train": [
+        "config.sieve",
+        "train.sieve"
     ]
 }
 script_names = {
     "spam-filter" : "Spam Filter",
     "track-replies" : "Track Replies",
-    "greylist" : "Greylisting"
+    "greylist" : "Greylisting",
+    "train": "Train Bayes Classifier"
 }
 
 maps = ["spam_config.map",
@@ -69,7 +74,7 @@ def
read_file(file): return f.read() + "\n" def build_spam_filters(scripts): - spam_filter = "[version]\nspam-filter = \"1.0\"\n\n" + spam_filter = "[version]\nspam-filter = \"1.1\"\n\n" for script_name, file_list in scripts.items(): script_content = read_and_concatenate(file_list).replace("'''", "\\'\\'\\'") script_description = script_names[script_name] diff --git a/resources/config/spamfilter.toml b/resources/config/spamfilter.toml index 97706a75..e07edc7a 100644 --- a/resources/config/spamfilter.toml +++ b/resources/config/spamfilter.toml @@ -1,5 +1,5 @@ [version] -spam-filter = "1.0" +spam-filter = "1.1" [sieve.trusted.scripts.spam-filter] name = "Spam Filter" @@ -17,7 +17,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')"; let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')"; # Whether the bayes classifier should be trained automatically -let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')"; +let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test"; # When to learn ham (score >= threshold) let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')"; @@ -61,7 +61,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)" # Obtain thread name and subject let "subject_lc" "to_lowercase(header.subject)"; let "subject_clean" "thread_name(header.subject)"; -let "body_and_subject" "subject_clean + text_body"; +let "body_and_subject" "subject_clean + ' ' + text_body"; # Obtain all recipients let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])"; @@ -2257,7 +2257,7 @@ while "i > 0" { if eval "is_empty(token_rep)" { # Set reputation - eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)"; + eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)"; continue; } @@ -2265,7 +2265,7 @@ while "i > 0" { let "token_score" "token_rep[0]"; let "token_count" "token_rep[1]"; let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) 
/ (0.98 * token_count + 1)"; - eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)"; + eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)"; # Assign weight let "weight" ""; @@ -2343,7 +2343,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')"; let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')"; # Whether the bayes classifier should be trained automatically -let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')"; +let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test"; # When to learn ham (score >= threshold) let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')"; @@ -2403,7 +2403,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')"; let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')"; # Whether the bayes classifier should be trained automatically -let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')"; +let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test"; # When to learn ham (score >= threshold) let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')"; @@ -2444,6 +2444,66 @@ if eval "!key_exists(SPAM_DB, triplet)" { ''' +[sieve.trusted.scripts.train] +name = "Train Bayes Classifier" +contents = ''' + +#### Script config.sieve #### + +# Whether to add an X-Spam-Status header +let "ADD_HEADER_SPAM" "key_get('spam-config', 'add-spam')"; + +# Whether to add an X-Spam-Result header +let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')"; + +# Whether message replies from authenticated users should be learned as ham +let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')"; + +# Whether the bayes classifier should be trained automatically +let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test"; + +# When to learn ham (score >= threshold) +let 
"AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')"; + +# When to learn spam (score <= threshold) +let "AUTOLEARN_SPAM_THRESHOLD" "key_get('spam-config', 'learn-spam-threshold')"; + +# Keep difference for spam/ham learns for at least this value +let "AUTOLEARN_SPAM_HAM_BALANCE" "key_get('spam-config', 'learn-balance')"; + +# If ADD_HEADER_SPAM is enabled, mark as SPAM messages with a score above this threshold +let "SCORE_SPAM_THRESHOLD" "key_get('spam-config', 'threshold-spam')"; + +# Discard messages with a score above this threshold +let "SCORE_DISCARD_THRESHOLD" "key_get('spam-config', 'threshold-discard')"; + +# Reject messages with a score above this threshold +let "SCORE_REJECT_THRESHOLD" "key_get('spam-config', 'threshold-reject')"; + +# Directory name to use for local domain lookups (leave empty for default) +let "DOMAIN_DIRECTORY" "key_get('spam-config', 'directory')"; + +# Store to use for Bayes tokens and ids (leave empty for default) +let "SPAM_DB" "key_get('spam-config', 'lookup')"; + + +#### Script train.sieve #### + + + +# Obtain thread name and subject +let "contents" "thread_name(header.subject) + ' ' + body.to_text"; + +if eval "env.train == 'spam'" { + eval "bayes_train(SPAM_DB, contents, true)"; +} elsif eval "env.train == 'ham'" { + eval "bayes_train(SPAM_DB, contents, false)"; +} else { + reject "Missing variable 'train'"; +} + +''' + [lookup] spam-config = { diff --git a/resources/config/spamfilter/scripts/config.sieve b/resources/config/spamfilter/scripts/config.sieve index 6499f41d..33d69816 100644 --- a/resources/config/spamfilter/scripts/config.sieve +++ b/resources/config/spamfilter/scripts/config.sieve @@ -8,7 +8,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')"; let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')"; # Whether the bayes classifier should be trained automatically -let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')"; +let "AUTOLEARN_ENABLE" 
"key_get('spam-config', 'learn-enable') && !env.test"; # When to learn ham (score >= threshold) let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')"; diff --git a/resources/config/spamfilter/scripts/prelude.sieve b/resources/config/spamfilter/scripts/prelude.sieve index c50de1ef..86367716 100644 --- a/resources/config/spamfilter/scripts/prelude.sieve +++ b/resources/config/spamfilter/scripts/prelude.sieve @@ -13,7 +13,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)" # Obtain thread name and subject let "subject_lc" "to_lowercase(header.subject)"; let "subject_clean" "thread_name(header.subject)"; -let "body_and_subject" "subject_clean + text_body"; +let "body_and_subject" "subject_clean + ' ' + text_body"; # Obtain all recipients let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])"; diff --git a/resources/config/spamfilter/scripts/reputation.sieve b/resources/config/spamfilter/scripts/reputation.sieve index 71ff4e99..0974f4e7 100644 --- a/resources/config/spamfilter/scripts/reputation.sieve +++ b/resources/config/spamfilter/scripts/reputation.sieve @@ -41,7 +41,7 @@ while "i > 0" { if eval "is_empty(token_rep)" { # Set reputation - eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)"; + eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)"; continue; } @@ -49,7 +49,7 @@ while "i > 0" { let "token_score" "token_rep[0]"; let "token_count" "token_rep[1]"; let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)"; - eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)"; + eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)"; # Assign weight let "weight" ""; diff --git a/resources/config/spamfilter/scripts/train.sieve b/resources/config/spamfilter/scripts/train.sieve new file mode 100644 index 00000000..8cae688e --- /dev/null +++ b/resources/config/spamfilter/scripts/train.sieve @@ -0,0 
+1,12 @@ + + +# Obtain thread name and subject +let "contents" "thread_name(header.subject) + ' ' + body.to_text"; + +if eval "env.train == 'spam'" { + eval "bayes_train(SPAM_DB, contents, true)"; +} elsif eval "env.train == 'ham'" { + eval "bayes_train(SPAM_DB, contents, false)"; +} else { + reject "Missing variable 'train'"; +}