Manually train and test spam classifier (closes #473 closes #264 closes #257 closes #471)

2024-09-20 07:16:18 +08:00 · 2024-06-10 13:02:52 +01:00 · 2024-06-10 13:02:52 +01:00 · 835c7d8c30
parent 8d3839a90b
commit 835c7d8c30
13 changed files with 225 additions and 16 deletions
--- a/crates/common/src/scripts/mod.rs
+++ b/crates/common/src/scripts/mod.rs
@ -8,7 +8,7 @@ use crate::IntoString;
 pub mod functions;
 pub mod plugins;

-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptModification {
    SetEnvelope {
        name: Envelope,
--- a/crates/common/src/scripts/plugins/bayes.rs
+++ b/crates/common/src/scripts/plugins/bayes.rs
@ -79,9 +79,17 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
    let text = ctx.arguments[1].to_string();
    let is_spam = ctx.arguments[2].to_bool();
    if text.is_empty() {
+        tracing::debug!(
+            parent: span,
+            context = "sieve:bayes_train",
+            event = "failed",
+            reason = "Empty message",
+        );
        return false.into();
    }

+    tracing::trace!(parent: span, context = "sieve:bayes_train", event = "train", is_spam);
+
    // Train the model
    let mut model = BayesModel::default();
    model.train(
@ -92,6 +100,12 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
        is_spam,
    );
    if model.weights.is_empty() {
+        tracing::debug!(
+            parent: span,
+            context = "sieve:bayes_train",
+            event = "failed",
+            reason = "No weights found",
+        );
        return false.into();
    }

--- a/crates/jmap/src/api/management/mod.rs
+++ b/crates/jmap/src/api/management/mod.rs
@ -29,6 +29,7 @@ pub mod queue;
 pub mod reload;
 pub mod report;
 pub mod settings;
+pub mod sieve;
 pub mod stores;

 use std::{borrow::Cow, sync::Arc};
@ -89,6 +90,7 @@ impl JMAP {
            "logs" if is_superuser && req.method() == Method::GET => {
                self.handle_view_logs(req).await
            }
+            "sieve" if is_superuser => self.handle_run_sieve(req, path, body).await,
            "restart" if is_superuser && req.method() == Method::GET => {
                ManagementApiError::Unsupported {
                    details: "Restart is not yet supported".into(),
--- a/crates/jmap/src/api/management/sieve.rs
+++ b/crates/jmap/src/api/management/sieve.rs
@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::time::SystemTime;
+
+use hyper::Method;
+use jmap_proto::error::request::RequestError;
+use serde_json::json;
+use sieve::{runtime::Variable, Envelope};
+use smtp::scripts::ScriptParameters;
+use utils::url_params::UrlParams;
+
+use crate::{
+    api::{http::ToHttpResponse, HttpRequest, HttpResponse, JsonResponse},
+    JMAP,
+};
+
+impl JMAP {
+    /// Management endpoint that executes one of the configured Sieve scripts
+    /// against the request body and returns the outcome as JSON.
+    ///
+    /// The script is looked up by `path[1]`; anything other than a `POST`
+    /// naming a known script is answered with "not found". The recognised
+    /// `env_*` query parameters populate the envelope (`env_to` is lowercased
+    /// and collected into a list), every other parameter becomes a script
+    /// variable. `now` (seconds since the Unix epoch, `0` if that cannot be
+    /// computed) and `test = true` are set before any query parameter.
+    pub async fn handle_run_sieve(
+        &self,
+        req: &HttpRequest,
+        path: Vec<&str>,
+        body: Option<Vec<u8>>,
+    ) -> HttpResponse {
+        // Maps a query parameter name to the envelope part it sets, if any.
+        // `env_to` is handled separately because its values are accumulated.
+        fn envelope_part(name: &str) -> Option<Envelope> {
+            Some(match name {
+                "env_from" => Envelope::From,
+                "env_orcpt" => Envelope::Orcpt,
+                "env_ret" => Envelope::Ret,
+                "env_notify" => Envelope::Notify,
+                "env_id" => Envelope::Envid,
+                "env_bym" => Envelope::ByMode,
+                "env_byt" => Envelope::ByTrace,
+                "env_byta" => Envelope::ByTimeAbsolute,
+                "env_bytr" => Envelope::ByTimeRelative,
+                _ => return None,
+            })
+        }
+
+        // Only a POST that names a known script is served
+        let script = match path
+            .get(1)
+            .and_then(|name| self.core.sieve.scripts.get(*name))
+        {
+            Some(script) if req.method() == Method::POST => script.clone(),
+            _ => return RequestError::not_found().into_http_response(),
+        };
+
+        // Built-in variables first, then the message to evaluate
+        let unix_now = SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map_or(0, |elapsed| elapsed.as_secs());
+        let mut params = ScriptParameters::new()
+            .set_variable("now", unix_now)
+            .set_variable("test", true)
+            .with_message(body.as_deref().unwrap_or_default());
+
+        // Translate query parameters into envelope parts and variables
+        let mut recipients = Vec::new();
+        for (name, value) in UrlParams::new(req.uri().query()).into_inner() {
+            if name == "env_to" {
+                recipients.push(Variable::from(value.to_lowercase()));
+                continue;
+            }
+
+            params = match envelope_part(&name) {
+                Some(part) => params.set_envelope(part, value),
+                None => params.set_variable(name.into_owned(), value.into_owned()),
+            };
+        }
+
+        if !recipients.is_empty() {
+            params = params.set_envelope(Envelope::To, Variable::from(recipients));
+        }
+
+        // Execute the script and return its result under the "data" key
+        let outcome = self
+            .smtp
+            .run_script(script, params, tracing::debug_span!("sieve_manual_run"))
+            .await;
+
+        JsonResponse::new(json!({ "data": outcome })).into_http_response()
+    }
+}
--- a/crates/smtp/src/scripts/event_loop.rs
+++ b/crates/smtp/src/scripts/event_loop.rs
@ -50,7 +50,7 @@ impl SMTP {
            .core
            .sieve
            .trusted_runtime
-            .filter(params.message.as_ref().map_or(b"", |m| &m[..]))
+            .filter(params.message.unwrap_or_default())
            .with_vars_env(params.variables)
            .with_envelope_list(params.envelope)
            .with_user_address(&params.from_addr)
--- a/crates/smtp/src/scripts/mod.rs
+++ b/crates/smtp/src/scripts/mod.rs
@ -31,7 +31,7 @@ pub mod envelope;
 pub mod event_loop;
 pub mod exec;

-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptResult {
    Accept {
        modifications: Vec<ScriptModification>,
@ -112,6 +112,11 @@ impl<'x> ScriptParameters<'x> {
        self
    }

+    /// Appends an envelope entry to these parameters, converting `value` into
+    /// a `Variable`, and returns the parameters so calls can be chained.
+    pub fn set_envelope(mut self, envelope: Envelope, value: impl Into<Variable>) -> Self {
+        let entry = (envelope, value.into());
+        self.envelope.push(entry);
+        self
+    }
+
    #[cfg(feature = "test_mode")]
    pub fn with_expected_variables(
        mut self,
--- a/crates/utils/src/url_params.rs
+++ b/crates/utils/src/url_params.rs
@ -55,4 +55,8 @@ impl<'x> UrlParams<'x> {
    {
        self.get(key).and_then(|v| v.parse().ok())
    }
+
+    /// Consumes the wrapper and hands back the underlying key/value map.
+    pub fn into_inner(self) -> HashMap<Cow<'x, str>, Cow<'x, str>> {
+        let Self { params, .. } = self;
+        params
+    }
 }
--- a/resources/config/build.py
+++ b/resources/config/build.py
@ -37,12 +37,17 @@ scripts = {
    "greylist": [
                "config.sieve",
                "greylist.sieve"
+    ],
+    "train": [
+                "config.sieve",
+                "train.sieve"
    ]
 }
 script_names = {
    "spam-filter" : "Spam Filter",
    "track-replies" : "Track Replies",
-    "greylist" : "Greylisting"
+    "greylist" : "Greylisting",
+    "train": "Train Bayes Classifier"
 }

 maps = ["spam_config.map",
@ -69,7 +74,7 @@ def read_file(file):
        return f.read() + "\n"

 def build_spam_filters(scripts):
-    spam_filter = "[version]\nspam-filter = \"1.0\"\n\n"
+    spam_filter = "[version]\nspam-filter = \"1.1\"\n\n"
    for script_name, file_list in scripts.items():
        script_content = read_and_concatenate(file_list).replace("'''", "\\'\\'\\'")
        script_description = script_names[script_name]
--- a/resources/config/spamfilter.toml
+++ b/resources/config/spamfilter.toml
@ -1,5 +1,5 @@
 [version]
-spam-filter = "1.0"
+spam-filter = "1.1"

 [sieve.trusted.scripts.spam-filter]
 name = "Spam Filter"
@ -17,7 +17,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";

 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";

 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -61,7 +61,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
 # Obtain thread name and subject
 let "subject_lc" "to_lowercase(header.subject)";
 let "subject_clean" "thread_name(header.subject)";
-let "body_and_subject" "subject_clean + text_body";
+let "body_and_subject" "subject_clean + ' ' + text_body";

 # Obtain all recipients
 let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";
@ -2257,7 +2257,7 @@ while "i > 0" {

    if eval "is_empty(token_rep)" {
        # Set reputation
-        eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
+        eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
        continue;
    }

@ -2265,7 +2265,7 @@ while "i > 0" {
    let "token_score" "token_rep[0]";
    let "token_count" "token_rep[1]";
    let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
-    eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
+    eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";

    # Assign weight
    let "weight" "";
@ -2343,7 +2343,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";

 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";

 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2403,7 +2403,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";

 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";

 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2444,6 +2444,66 @@ if eval "!key_exists(SPAM_DB, triplet)" {

 '''

+[sieve.trusted.scripts.train]
+name = "Train Bayes Classifier"
+contents = '''
+
+#### Script config.sieve ####
+
+# Whether to add an X-Spam-Status header
+let "ADD_HEADER_SPAM" "key_get('spam-config', 'add-spam')";
+
+# Whether to add an X-Spam-Result header
+let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
+
+# Whether message replies from authenticated users should be learned as ham
+let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
+
+# Whether the bayes classifier should be trained automatically
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
+
+# When to learn ham (score >= threshold)
+let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
+
+# When to learn spam (score <= threshold)
+let "AUTOLEARN_SPAM_THRESHOLD" "key_get('spam-config', 'learn-spam-threshold')";
+
+# Keep difference for spam/ham learns for at least this value
+let "AUTOLEARN_SPAM_HAM_BALANCE" "key_get('spam-config', 'learn-balance')";
+
+# If ADD_HEADER_SPAM is enabled, mark as SPAM messages with a score above this threshold
+let "SCORE_SPAM_THRESHOLD" "key_get('spam-config', 'threshold-spam')";
+
+# Discard messages with a score above this threshold
+let "SCORE_DISCARD_THRESHOLD" "key_get('spam-config', 'threshold-discard')";
+
+# Reject messages with a score above this threshold
+let "SCORE_REJECT_THRESHOLD" "key_get('spam-config', 'threshold-reject')";
+
+# Directory name to use for local domain lookups (leave empty for default)
+let "DOMAIN_DIRECTORY" "key_get('spam-config', 'directory')";
+
+# Store to use for Bayes tokens and ids (leave empty for default)
+let "SPAM_DB" "key_get('spam-config', 'lookup')";
+
+
+#### Script train.sieve ####
+
+# Manual Bayes training: passes this message to bayes_train as spam or ham,
+# as selected by env.train; any other (or missing) value rejects the message.
+# Training text: thread name of the subject, a space, then body.to_text
+let "contents" "thread_name(header.subject) + ' ' + body.to_text";
+
+if eval "env.train == 'spam'" {
+    eval "bayes_train(SPAM_DB, contents, true)";
+} elsif eval "env.train == 'ham'" {
+    eval "bayes_train(SPAM_DB, contents, false)";
+} else {
+    reject "Missing variable 'train'";
+}
+
+'''
+

 [lookup]
 spam-config = {
--- a/resources/config/spamfilter/scripts/config.sieve
+++ b/resources/config/spamfilter/scripts/config.sieve
@ -8,7 +8,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";

 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";

 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
--- a/resources/config/spamfilter/scripts/prelude.sieve
+++ b/resources/config/spamfilter/scripts/prelude.sieve
@ -13,7 +13,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
 # Obtain thread name and subject
 let "subject_lc" "to_lowercase(header.subject)";
 let "subject_clean" "thread_name(header.subject)";
-let "body_and_subject" "subject_clean + text_body";
+let "body_and_subject" "subject_clean + ' ' + text_body";

 # Obtain all recipients
 let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";
--- a/resources/config/spamfilter/scripts/reputation.sieve
+++ b/resources/config/spamfilter/scripts/reputation.sieve
@ -41,7 +41,7 @@ while "i > 0" {

    if eval "is_empty(token_rep)" {
        # Set reputation
-        eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
+        eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
        continue;
    }

@ -49,7 +49,7 @@ while "i > 0" {
    let "token_score" "token_rep[0]";
    let "token_count" "token_rep[1]";
    let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
-    eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
+    eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";

    # Assign weight
    let "weight" "";
--- a/resources/config/spamfilter/scripts/train.sieve
+++ b/resources/config/spamfilter/scripts/train.sieve
@ -0,0 +1,12 @@
+# Manual Bayes training: passes this message to bayes_train as spam or ham,
+# as selected by env.train; any other (or missing) value rejects the message.
+# Training text: thread name of the subject, a space, then body.to_text
+let "contents" "thread_name(header.subject) + ' ' + body.to_text";
+
+if eval "env.train == 'spam'" {
+    eval "bayes_train(SPAM_DB, contents, true)";
+} elsif eval "env.train == 'ham'" {
+    eval "bayes_train(SPAM_DB, contents, false)";
+} else {
+    reject "Missing variable 'train'";
+}