Manually train and test spam classifier (closes #473 closes #264 closes #257 closes #471)

2024-09-20 15:26:17 +08:00 · 2024-06-10 13:02:52 +01:00 · 2024-06-10 13:02:52 +01:00 · 835c7d8c30
parent 8d3839a90b
commit 835c7d8c30
13 changed files with 225 additions and 16 deletions
--- a/crates/common/src/scripts/mod.rs
+++ b/crates/common/src/scripts/mod.rs
@ -8,7 +8,7 @@ use crate::IntoString;
 pub mod functions;
 pub mod plugins;
-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptModification {
    SetEnvelope {
        name: Envelope,
--- a/crates/common/src/scripts/plugins/bayes.rs
+++ b/crates/common/src/scripts/plugins/bayes.rs
@ -79,9 +79,17 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
    let text = ctx.arguments[1].to_string();
    let is_spam = ctx.arguments[2].to_bool();
    if text.is_empty() {
        tracing::debug!(
            parent: span,
            context = "sieve:bayes_train",
            event = "failed",
            reason = "Empty message",
        );
        return false.into();
    }
    // Train the model
    let mut model = BayesModel::default();
    model.train(
@ -92,6 +100,12 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
        is_spam,
    );
    if model.weights.is_empty() {
        tracing::debug!(
            parent: span,
            context = "sieve:bayes_train",
            event = "failed",
            reason = "No weights found",
        );
        return false.into();
    }
--- a/crates/jmap/src/api/management/mod.rs
+++ b/crates/jmap/src/api/management/mod.rs
@ -29,6 +29,7 @@ pub mod queue;
 pub mod reload;
 pub mod report;
 pub mod settings;
 pub mod sieve;
 pub mod stores;
 use std::{borrow::Cow, sync::Arc};
@ -89,6 +90,7 @@ impl JMAP {
            "logs" if is_superuser && req.method() == Method::GET => {
                self.handle_view_logs(req).await
            }
            "sieve" if is_superuser => self.handle_run_sieve(req, path, body).await,
            "restart" if is_superuser && req.method() == Method::GET => {
                ManagementApiError::Unsupported {
                    details: "Restart is not yet supported".into(),
--- a/crates/jmap/src/api/management/sieve.rs
+++ b/crates/jmap/src/api/management/sieve.rs
@ -0,0 +1,107 @@
 /*
 * Copyright (c) 2023 Stalwart Labs Ltd.
 *
 * This file is part of Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */
 use std::time::SystemTime;
 use hyper::Method;
 use jmap_proto::error::request::RequestError;
 use serde_json::json;
 use sieve::{runtime::Variable, Envelope};
 use smtp::scripts::ScriptParameters;
 use utils::url_params::UrlParams;
 use crate::{
    api::{http::ToHttpResponse, HttpRequest, HttpResponse, JsonResponse},
    JMAP,
 };
 impl JMAP {
    pub async fn handle_run_sieve(
        &self,
        req: &HttpRequest,
        path: Vec<&str>,
        body: Option<Vec<u8>>,
    ) -> HttpResponse {
        let script = match (
            path.get(1)
                .and_then(|name| self.core.sieve.scripts.get(*name))
                .cloned(),
            req.method(),
        ) {
            (Some(script), &Method::POST) => script,
            _ => {
                return RequestError::not_found().into_http_response();
            }
        };
        let mut params = ScriptParameters::new()
            .set_variable(
                "now",
                SystemTime::now()
                    .duration_since(SystemTime::UNIX_EPOCH)
                    .map_or(0, |d| d.as_secs()),
            )
            .set_variable("test", true)
            .with_message(body.as_deref().unwrap_or_default());
        let mut envelope_to = Vec::new();
        for (key, value) in UrlParams::new(req.uri().query()).into_inner() {
            let env = match key.as_ref() {
                "env_to" => {
                    envelope_to.push(Variable::from(value.to_lowercase()));
                    continue;
                }
                "env_from" => Envelope::From,
                "env_orcpt" => Envelope::Orcpt,
                "env_ret" => Envelope::Ret,
                "env_notify" => Envelope::Notify,
                "env_id" => Envelope::Envid,
                "env_bym" => Envelope::ByMode,
                "env_byt" => Envelope::ByTrace,
                "env_byta" => Envelope::ByTimeAbsolute,
                "env_bytr" => Envelope::ByTimeRelative,
                _ => {
                    params = params.set_variable(key.into_owned(), value.into_owned());
                    continue;
                }
            };
            params = params.set_envelope(env, value);
        }
        if !envelope_to.is_empty() {
            params = params.set_envelope(Envelope::To, Variable::from(envelope_to));
        }
        // Run script
        let result = self
            .smtp
            .run_script(script, params, tracing::debug_span!("sieve_manual_run"))
            .await;
        JsonResponse::new(json!({
            "data": result,
        }))
        .into_http_response()
    }
 }
--- a/crates/smtp/src/scripts/event_loop.rs
+++ b/crates/smtp/src/scripts/event_loop.rs
@ -50,7 +50,7 @@ impl SMTP {
            .core
            .sieve
            .trusted_runtime
-            .filter(params.message.as_ref().map_or(b"", |m| &m[..]))
+            .filter(params.message.unwrap_or_default())
            .with_vars_env(params.variables)
            .with_envelope_list(params.envelope)
            .with_user_address(&params.from_addr)
--- a/crates/smtp/src/scripts/mod.rs
+++ b/crates/smtp/src/scripts/mod.rs
@ -31,7 +31,7 @@ pub mod envelope;
 pub mod event_loop;
 pub mod exec;
-#[derive(Debug)]
+#[derive(Debug, serde::Serialize)]
 pub enum ScriptResult {
    Accept {
        modifications: Vec<ScriptModification>,
@ -112,6 +112,11 @@ impl<'x> ScriptParameters<'x> {
        self
    }
    /// Appends an envelope entry to these parameters and returns the updated
    /// value, so calls can be chained. `value` is converted into a `Variable`
    /// before it is stored.
    pub fn set_envelope(mut self, envelope: Envelope, value: impl Into<Variable>) -> Self {
        let entry = (envelope, value.into());
        self.envelope.push(entry);
        self
    }
    #[cfg(feature = "test_mode")]
    pub fn with_expected_variables(
        mut self,
--- a/crates/utils/src/url_params.rs
+++ b/crates/utils/src/url_params.rs
@ -55,4 +55,8 @@ impl<'x> UrlParams<'x> {
    {
        self.get(key).and_then(|v| v.parse().ok())
    }
    /// Consumes the `UrlParams` and returns the underlying map of parameter
    /// names to values.
    pub fn into_inner(self) -> HashMap<Cow<'x, str>, Cow<'x, str>> {
        self.params
    }
 }
--- a/resources/config/build.py
+++ b/resources/config/build.py
@ -37,12 +37,17 @@ scripts = {
    "greylist": [
                "config.sieve",
                "greylist.sieve"
    ],
    "train": [
                "config.sieve",
                "train.sieve"
    ]
 }
 script_names = {
    "spam-filter" : "Spam Filter",
    "track-replies" : "Track Replies",
-    "greylist" : "Greylisting"
+    "greylist" : "Greylisting",
    "train": "Train Bayes Classifier"
 }
 maps = ["spam_config.map",
@ -69,7 +74,7 @@ def read_file(file):
        return f.read() + "\n"
 def build_spam_filters(scripts):
-    spam_filter = "[version]\nspam-filter = \"1.0\"\n\n"
+    spam_filter = "[version]\nspam-filter = \"1.1\"\n\n"
    for script_name, file_list in scripts.items():
        script_content = read_and_concatenate(file_list).replace("'''", "\\'\\'\\'")
        script_description = script_names[script_name]
--- a/resources/config/spamfilter.toml
+++ b/resources/config/spamfilter.toml
@ -1,5 +1,5 @@
 [version]
-spam-filter = "1.0"
+spam-filter = "1.1"
 [sieve.trusted.scripts.spam-filter]
 name = "Spam Filter"
@ -17,7 +17,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -61,7 +61,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
 # Obtain thread name and subject
 let "subject_lc" "to_lowercase(header.subject)";
 let "subject_clean" "thread_name(header.subject)";
-let "body_and_subject" "subject_clean + text_body";
+let "body_and_subject" "subject_clean + ' ' + text_body";
 # Obtain all recipients
 let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";
@ -2257,7 +2257,7 @@ while "i > 0" {
    if eval "is_empty(token_rep)" {
        # Set reputation
-        eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
+        eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
        continue;
    }
@ -2265,7 +2265,7 @@ while "i > 0" {
    let "token_score" "token_rep[0]";
    let "token_count" "token_rep[1]";
    let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
-    eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
+    eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
    # Assign weight
    let "weight" "";
@ -2343,7 +2343,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2403,7 +2403,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2444,6 +2444,66 @@ if eval "!key_exists(SPAM_DB, triplet)" {
 '''
 [sieve.trusted.scripts.train]
 name = "Train Bayes Classifier"
 contents = '''
 #### Script config.sieve ####
 # Whether to add an X-Spam-Status header
 let "ADD_HEADER_SPAM" "key_get('spam-config', 'add-spam')";
 # Whether to add an X-Spam-Result header
 let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 # Whether message replies from authenticated users should be learned as ham
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
 # Whether the bayes classifier should be trained automatically
 let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
 # When to learn spam (score <= threshold)
 let "AUTOLEARN_SPAM_THRESHOLD" "key_get('spam-config', 'learn-spam-threshold')";
 # Keep difference for spam/ham learns for at least this value
 let "AUTOLEARN_SPAM_HAM_BALANCE" "key_get('spam-config', 'learn-balance')";
 # If ADD_HEADER_SPAM is enabled, mark as SPAM messages with a score above this threshold
 let "SCORE_SPAM_THRESHOLD" "key_get('spam-config', 'threshold-spam')";
 # Discard messages with a score above this threshold
 let "SCORE_DISCARD_THRESHOLD" "key_get('spam-config', 'threshold-discard')";
 # Reject messages with a score above this threshold
 let "SCORE_REJECT_THRESHOLD" "key_get('spam-config', 'threshold-reject')";
 # Directory name to use for local domain lookups (leave empty for default)
 let "DOMAIN_DIRECTORY" "key_get('spam-config', 'directory')";
 # Store to use for Bayes tokens and ids (leave empty for default)
 let "SPAM_DB" "key_get('spam-config', 'lookup')";
 #### Script train.sieve ####
 # Obtain thread name and subject
 let "contents" "thread_name(header.subject) + ' ' + body.to_text";
 if eval "env.train == 'spam'" {
    eval "bayes_train(SPAM_DB, contents, true)";
 } elsif eval "env.train == 'ham'" {
    eval "bayes_train(SPAM_DB, contents, false)";
 } else {
    reject "Missing variable 'train'";
 }
 '''
 [lookup]
 spam-config = {
--- a/resources/config/spamfilter/scripts/config.sieve
+++ b/resources/config/spamfilter/scripts/config.sieve
@ -8,7 +8,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
 let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
 # Whether the bayes classifier should be trained automatically
-let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
+let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
 # When to learn ham (score >= threshold)
 let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
--- a/resources/config/spamfilter/scripts/prelude.sieve
+++ b/resources/config/spamfilter/scripts/prelude.sieve
@ -13,7 +13,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
 # Obtain thread name and subject
 let "subject_lc" "to_lowercase(header.subject)";
 let "subject_clean" "thread_name(header.subject)";
-let "body_and_subject" "subject_clean + text_body";
+let "body_and_subject" "subject_clean + ' ' + text_body";
 # Obtain all recipients
 let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";
--- a/resources/config/spamfilter/scripts/reputation.sieve
+++ b/resources/config/spamfilter/scripts/reputation.sieve
@ -41,7 +41,7 @@ while "i > 0" {
    if eval "is_empty(token_rep)" {
        # Set reputation
-        eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
+        eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
        continue;
    }
@ -49,7 +49,7 @@ while "i > 0" {
    let "token_score" "token_rep[0]";
    let "token_count" "token_rep[1]";
    let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
-    eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
+    eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
    # Assign weight
    let "weight" "";
--- a/resources/config/spamfilter/scripts/train.sieve
+++ b/resources/config/spamfilter/scripts/train.sieve
@ -0,0 +1,12 @@
 # Text to learn from: the subject's thread name, a space, then body.to_text
 let "contents" "thread_name(header.subject) + ' ' + body.to_text";
 # The 'train' variable selects the class to learn: 'spam' or 'ham'
 if eval "env.train == 'spam'" {
    eval "bayes_train(SPAM_DB, contents, true)";
 } elsif eval "env.train == 'ham'" {
    eval "bayes_train(SPAM_DB, contents, false)";
 } else {
    # Any other value, or no value at all, is rejected
    reject "Missing variable 'train'";
 }