Manually train and test spam classifier (closes #473 closes #264 closes #257 closes #471)

This commit is contained in:
mdecimus 2024-06-10 13:02:52 +01:00
parent 8d3839a90b
commit 835c7d8c30
13 changed files with 225 additions and 16 deletions

View file

@ -8,7 +8,7 @@ use crate::IntoString;
pub mod functions;
pub mod plugins;
#[derive(Debug)]
#[derive(Debug, serde::Serialize)]
pub enum ScriptModification {
SetEnvelope {
name: Envelope,

View file

@ -79,9 +79,17 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
let text = ctx.arguments[1].to_string();
let is_spam = ctx.arguments[2].to_bool();
if text.is_empty() {
tracing::debug!(
parent: span,
context = "sieve:bayes_train",
event = "failed",
reason = "Empty message",
);
return false.into();
}
let c = println!("training: {:?} {}", text, is_spam);
// Train the model
let mut model = BayesModel::default();
model.train(
@ -92,6 +100,12 @@ async fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable {
is_spam,
);
if model.weights.is_empty() {
tracing::debug!(
parent: span,
context = "sieve:bayes_train",
event = "failed",
reason = "No weights found",
);
return false.into();
}

View file

@ -29,6 +29,7 @@ pub mod queue;
pub mod reload;
pub mod report;
pub mod settings;
pub mod sieve;
pub mod stores;
use std::{borrow::Cow, sync::Arc};
@ -89,6 +90,7 @@ impl JMAP {
"logs" if is_superuser && req.method() == Method::GET => {
self.handle_view_logs(req).await
}
"sieve" if is_superuser => self.handle_run_sieve(req, path, body).await,
"restart" if is_superuser && req.method() == Method::GET => {
ManagementApiError::Unsupported {
details: "Restart is not yet supported".into(),

View file

@ -0,0 +1,107 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::time::SystemTime;
use hyper::Method;
use jmap_proto::error::request::RequestError;
use serde_json::json;
use sieve::{runtime::Variable, Envelope};
use smtp::scripts::ScriptParameters;
use utils::url_params::UrlParams;
use crate::{
api::{http::ToHttpResponse, HttpRequest, HttpResponse, JsonResponse},
JMAP,
};
impl JMAP {
pub async fn handle_run_sieve(
&self,
req: &HttpRequest,
path: Vec<&str>,
body: Option<Vec<u8>>,
) -> HttpResponse {
let script = match (
path.get(1)
.and_then(|name| self.core.sieve.scripts.get(*name))
.cloned(),
req.method(),
) {
(Some(script), &Method::POST) => script,
_ => {
return RequestError::not_found().into_http_response();
}
};
let mut params = ScriptParameters::new()
.set_variable(
"now",
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.map_or(0, |d| d.as_secs()),
)
.set_variable("test", true)
.with_message(body.as_deref().unwrap_or_default());
let mut envelope_to = Vec::new();
for (key, value) in UrlParams::new(req.uri().query()).into_inner() {
let env = match key.as_ref() {
"env_to" => {
envelope_to.push(Variable::from(value.to_lowercase()));
continue;
}
"env_from" => Envelope::From,
"env_orcpt" => Envelope::Orcpt,
"env_ret" => Envelope::Ret,
"env_notify" => Envelope::Notify,
"env_id" => Envelope::Envid,
"env_bym" => Envelope::ByMode,
"env_byt" => Envelope::ByTrace,
"env_byta" => Envelope::ByTimeAbsolute,
"env_bytr" => Envelope::ByTimeRelative,
_ => {
params = params.set_variable(key.into_owned(), value.into_owned());
continue;
}
};
params = params.set_envelope(env, value);
}
if !envelope_to.is_empty() {
params = params.set_envelope(Envelope::To, Variable::from(envelope_to));
}
// Run script
let result = self
.smtp
.run_script(script, params, tracing::debug_span!("sieve_manual_run"))
.await;
JsonResponse::new(json!({
"data": result,
}))
.into_http_response()
}
}

View file

@ -50,7 +50,7 @@ impl SMTP {
.core
.sieve
.trusted_runtime
.filter(params.message.as_ref().map_or(b"", |m| &m[..]))
.filter(params.message.unwrap_or_default())
.with_vars_env(params.variables)
.with_envelope_list(params.envelope)
.with_user_address(&params.from_addr)

View file

@ -31,7 +31,7 @@ pub mod envelope;
pub mod event_loop;
pub mod exec;
#[derive(Debug)]
#[derive(Debug, serde::Serialize)]
pub enum ScriptResult {
Accept {
modifications: Vec<ScriptModification>,
@ -112,6 +112,11 @@ impl<'x> ScriptParameters<'x> {
self
}
/// Builder-style setter: converts `value` into a `Variable`, appends it to
/// the envelope list paired with `envelope`, and returns the updated
/// parameters.
pub fn set_envelope(mut self, envelope: Envelope, value: impl Into<Variable>) -> Self {
    let entry = (envelope, value.into());
    self.envelope.push(entry);
    self
}
#[cfg(feature = "test_mode")]
pub fn with_expected_variables(
mut self,

View file

@ -55,4 +55,8 @@ impl<'x> UrlParams<'x> {
{
self.get(key).and_then(|v| v.parse().ok())
}
/// Consumes the `UrlParams` and returns the underlying key/value map.
pub fn into_inner(self) -> HashMap<Cow<'x, str>, Cow<'x, str>> {
self.params
}
}

View file

@ -37,12 +37,17 @@ scripts = {
"greylist": [
"config.sieve",
"greylist.sieve"
],
"train": [
"config.sieve",
"train.sieve"
]
}
script_names = {
"spam-filter" : "Spam Filter",
"track-replies" : "Track Replies",
"greylist" : "Greylisting"
"greylist" : "Greylisting",
"train": "Train Bayes Classifier"
}
maps = ["spam_config.map",
@ -69,7 +74,7 @@ def read_file(file):
return f.read() + "\n"
def build_spam_filters(scripts):
spam_filter = "[version]\nspam-filter = \"1.0\"\n\n"
spam_filter = "[version]\nspam-filter = \"1.1\"\n\n"
for script_name, file_list in scripts.items():
script_content = read_and_concatenate(file_list).replace("'''", "\\'\\'\\'")
script_description = script_names[script_name]

View file

@ -1,5 +1,5 @@
[version]
spam-filter = "1.0"
spam-filter = "1.1"
[sieve.trusted.scripts.spam-filter]
name = "Spam Filter"
@ -17,7 +17,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
# Whether the bayes classifier should be trained automatically
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
# When to learn ham (score >= threshold)
let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -61,7 +61,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
# Obtain thread name and subject
let "subject_lc" "to_lowercase(header.subject)";
let "subject_clean" "thread_name(header.subject)";
let "body_and_subject" "subject_clean + text_body";
let "body_and_subject" "subject_clean + ' ' + text_body";
# Obtain all recipients
let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";
@ -2257,7 +2257,7 @@ while "i > 0" {
if eval "is_empty(token_rep)" {
# Set reputation
eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
continue;
}
@ -2265,7 +2265,7 @@ while "i > 0" {
let "token_score" "token_rep[0]";
let "token_count" "token_rep[1]";
let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
# Assign weight
let "weight" "";
@ -2343,7 +2343,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
# Whether the bayes classifier should be trained automatically
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
# When to learn ham (score >= threshold)
let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2403,7 +2403,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
# Whether the bayes classifier should be trained automatically
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
# When to learn ham (score >= threshold)
let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
@ -2444,6 +2444,66 @@ if eval "!key_exists(SPAM_DB, triplet)" {
'''
[sieve.trusted.scripts.train]
name = "Train Bayes Classifier"
contents = '''
#### Script config.sieve ####
# Whether to add an X-Spam-Status header
let "ADD_HEADER_SPAM" "key_get('spam-config', 'add-spam')";
# Whether to add an X-Spam-Result header
let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
# Whether message replies from authenticated users should be learned as ham
let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
# Whether the bayes classifier should be trained automatically
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
# When to learn ham (score >= threshold)
let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";
# When to learn spam (score <= threshold)
let "AUTOLEARN_SPAM_THRESHOLD" "key_get('spam-config', 'learn-spam-threshold')";
# Keep difference for spam/ham learns for at least this value
let "AUTOLEARN_SPAM_HAM_BALANCE" "key_get('spam-config', 'learn-balance')";
# If ADD_HEADER_SPAM is enabled, mark as SPAM messages with a score above this threshold
let "SCORE_SPAM_THRESHOLD" "key_get('spam-config', 'threshold-spam')";
# Discard messages with a score above this threshold
let "SCORE_DISCARD_THRESHOLD" "key_get('spam-config', 'threshold-discard')";
# Reject messages with a score above this threshold
let "SCORE_REJECT_THRESHOLD" "key_get('spam-config', 'threshold-reject')";
# Directory name to use for local domain lookups (leave empty for default)
let "DOMAIN_DIRECTORY" "key_get('spam-config', 'directory')";
# Store to use for Bayes tokens and ids (leave empty for default)
let "SPAM_DB" "key_get('spam-config', 'lookup')";
#### Script train.sieve ####
# Obtain thread name and subject
let "contents" "thread_name(header.subject) + ' ' + body.to_text";
if eval "env.train == 'spam'" {
eval "bayes_train(SPAM_DB, contents, true)";
} elsif eval "env.train == 'ham'" {
eval "bayes_train(SPAM_DB, contents, false)";
} else {
reject "Missing variable 'train'";
}
'''
[lookup]
spam-config = {

View file

@ -8,7 +8,7 @@ let "ADD_HEADER_SPAM_RESULT" "key_get('spam-config', 'add-spam-result')";
let "AUTOLEARN_REPLIES_HAM" "key_get('spam-config', 'learn-ham-replies')";
# Whether the bayes classifier should be trained automatically
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable')";
let "AUTOLEARN_ENABLE" "key_get('spam-config', 'learn-enable') && !env.test";
# When to learn ham (score >= threshold)
let "AUTOLEARN_HAM_THRESHOLD" "key_get('spam-config', 'learn-ham-threshold')";

View file

@ -13,7 +13,7 @@ let "urls" "dedup(tokenize(header.subject, 'uri') + body_urls + html_body_urls)"
# Obtain thread name and subject
let "subject_lc" "to_lowercase(header.subject)";
let "subject_clean" "thread_name(header.subject)";
let "body_and_subject" "subject_clean + text_body";
let "body_and_subject" "subject_clean + ' ' + text_body";
# Obtain all recipients
let "recipients" "to_lowercase(header.to:cc:bcc[*].addr[*])";

View file

@ -41,7 +41,7 @@ while "i > 0" {
if eval "is_empty(token_rep)" {
# Set reputation
eval "key_set(SPAM_DB, token_id, [score, 1], 2592000)";
eval "!env.test && key_set(SPAM_DB, token_id, [score, 1], 2592000)";
continue;
}
@ -49,7 +49,7 @@ while "i > 0" {
let "token_score" "token_rep[0]";
let "token_count" "token_rep[1]";
let "updated_score" "(token_count + 1) * (score + 0.98 * token_score) / (0.98 * token_count + 1)";
eval "key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
eval "!env.test && key_set(SPAM_DB, token_id, [updated_score, token_count + 1], 2592000)";
# Assign weight
let "weight" "";

View file

@ -0,0 +1,12 @@
# Train the Bayes classifier with the message being processed.
# The 'train' environment variable selects the class: 'spam' or 'ham'.
# SPAM_DB is not defined in this script; the build configuration lists
# config.sieve ahead of it, which is presumably where it comes from.

# Build the training text: thread name of the subject, a space, then the body as text
let "contents" "thread_name(header.subject) + ' ' + body.to_text";
# The third argument of bayes_train is true for spam and false for ham
if eval "env.train == 'spam'" {
eval "bayes_train(SPAM_DB, contents, true)";
} elsif eval "env.train == 'ham'" {
eval "bayes_train(SPAM_DB, contents, false)";
} else {
# 'train' is missing or holds any other value: reject the message
reject "Missing variable 'train'";
}