From 043b53f3e9a2f077d30fec546a3e02ad5683e695 Mon Sep 17 00:00:00 2001 From: mdecimus Date: Sun, 5 Jan 2025 19:31:07 +0100 Subject: [PATCH] Fixed MIXED_CHARSET spam filter check --- crates/common/src/scripts/functions/mod.rs | 2 +- .../common/src/scripts/functions/unicode.rs | 9 +++++---- crates/common/src/scripts/mod.rs | 19 +++++++++++++++++++ crates/spam-filter/src/analysis/mime.rs | 10 ++++++---- crates/spam-filter/src/analysis/url.rs | 4 ++-- tests/resources/smtp/antispam/combined.test | 4 ++-- tests/resources/smtp/antispam/url.test | 2 +- 7 files changed, 36 insertions(+), 14 deletions(-) diff --git a/crates/common/src/scripts/functions/mod.rs b/crates/common/src/scripts/functions/mod.rs index e0f27e67..18c09ba0 100644 --- a/crates/common/src/scripts/functions/mod.rs +++ b/crates/common/src/scripts/functions/mod.rs @@ -51,7 +51,7 @@ pub fn register_functions_trusted() -> FunctionMap { .with_function("winnow", fn_winnow) .with_function("has_zwsp", fn_has_zwsp) .with_function("has_obscured", fn_has_obscured) - .with_function("is_single_script", fn_is_single_script) + .with_function("is_mixed_charset", fn_is_mixed_charset) .with_function("puny_decode", fn_puny_decode) .with_function("unicode_skeleton", fn_unicode_skeleton) .with_function("cure_text", fn_cure_text) diff --git a/crates/common/src/scripts/functions/unicode.rs b/crates/common/src/scripts/functions/unicode.rs index 941c164d..720fc56f 100644 --- a/crates/common/src/scripts/functions/unicode.rs +++ b/crates/common/src/scripts/functions/unicode.rs @@ -5,7 +5,8 @@ */ use sieve::{runtime::Variable, Context}; -use unicode_security::MixedScript; + +use crate::scripts::IsMixedCharset; pub fn fn_is_ascii<'x>(_: &'x Context<'x>, v: Vec) -> Variable { match &v[0] { @@ -80,12 +81,12 @@ pub fn fn_unicode_skeleton<'x>(_: &'x Context<'x>, v: Vec) -> Variable .into() } -pub fn fn_is_single_script<'x>(_: &'x Context<'x>, v: Vec) -> Variable { +pub fn fn_is_mixed_charset<'x>(_: &'x Context<'x>, v: Vec) -> Variable { let text = v[0].to_string(); if !text.is_empty() { - text.as_ref().is_single_script() + text.as_ref().is_mixed_charset() } else { - true + false } .into() } diff --git a/crates/common/src/scripts/mod.rs b/crates/common/src/scripts/mod.rs index d0384875..0ffbb9ae 100644 --- a/crates/common/src/scripts/mod.rs +++ b/crates/common/src/scripts/mod.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use sieve::{runtime::Variable, Envelope}; use store::Value; +use unicode_security::mixed_script::AugmentedScriptSet; use crate::IntoString; @@ -56,3 +57,21 @@ pub fn to_store_value(value: &Variable) -> Value<'static> { v => Value::Text(v.to_string().into_owned().into()), } } + +pub trait IsMixedCharset { + fn is_mixed_charset(&self) -> bool; +} + +impl> IsMixedCharset for T { + fn is_mixed_charset(&self) -> bool { + let mut set: Option = None; + + for ch in self.as_ref().chars() { + if !ch.is_ascii() { + set.get_or_insert_default().intersect_with(ch.into()); + } + } + + set.map_or(false, |set| set.is_empty()) + } +} diff --git a/crates/spam-filter/src/analysis/mime.rs b/crates/spam-filter/src/analysis/mime.rs index 03ac3af4..198295c2 100644 --- a/crates/spam-filter/src/analysis/mime.rs +++ b/crates/spam-filter/src/analysis/mime.rs @@ -7,12 +7,14 @@ use std::{collections::HashSet, future::Future, vec}; use common::{ - scripts::functions::{array::cosine_similarity, unicode::CharUtils}, + scripts::{ + functions::{array::cosine_similarity, unicode::CharUtils}, + IsMixedCharset, + }, Server, }; use mail_parser::{HeaderName, MimeHeaders, PartType}; use nlp::tokenizers::types::TokenType; -use unicode_security::MixedScript; use crate::{SpamFilterContext, TextPart}; @@ -304,8 +306,8 @@ impl SpamFilterAnalyzeMime for Server { || ctx.input.message.html_body.contains(&part_id) }) .map_or(false, |p| match p { - TextPart::Plain { text_body, .. } => !text_body.is_single_script(), - TextPart::Html { text_body, .. } => !text_body.is_single_script(), + TextPart::Plain { text_body, .. } => text_body.is_mixed_charset(), + TextPart::Html { text_body, .. } => text_body.is_mixed_charset(), TextPart::None => false, }) { diff --git a/crates/spam-filter/src/analysis/url.rs b/crates/spam-filter/src/analysis/url.rs index a1680dcd..158bc6a9 100644 --- a/crates/spam-filter/src/analysis/url.rs +++ b/crates/spam-filter/src/analysis/url.rs @@ -10,11 +10,11 @@ use std::{borrow::Cow, future::Future, time::Duration}; use common::config::spamfilter::{Element, IpResolver, Location}; use common::scripts::functions::unicode::CharUtils; +use common::scripts::IsMixedCharset; use common::Server; use hyper::{header::LOCATION, Uri}; use nlp::tokenizers::types::TokenType; use reqwest::redirect::Policy; -use unicode_security::MixedScript; use crate::modules::dnsbl::check_dnsbl; use crate::modules::expression::StringResolver; @@ -244,7 +244,7 @@ impl SpamFilterAnalyzeUrl for Server { } } - if !host.fqdn.is_single_script() { + if host.fqdn.is_mixed_charset() { ctx.result.add_tag("MIXED_CHARSET_URL"); } } diff --git a/tests/resources/smtp/antispam/combined.test b/tests/resources/smtp/antispam/combined.test index c23f5b0a..1711d3e7 100644 --- a/tests/resources/smtp/antispam/combined.test +++ b/tests/resources/smtp/antispam/combined.test @@ -1021,8 +1021,8 @@ dmarc.result pass dmarc.policy reject remote_ip 173.224.123.255 tls.version TLS1_2 -expect_header X-Spam-Result: DMARC_POLICY_ALLOW (-0.50), DKIM_ALLOW (-0.20), SPF_ALLOW (-0.20), ARC_NA (0.00), DKIM_SIGNED (0.00), FROM_EQ_ENV_FROM (0.00), FROM_HAS_DN (0.00), HAS_EXTERNAL_IMG (0.00), HAS_REPLYTO (0.00), HAS_X_PRIO_THREE (0.00), HTML_SHORT_1 (0.00), RCPT_COUNT_ONE (0.00), REPLYTO_DN_EQ_FROM_DN (0.00), REPLYTO_DOM_EQ_FROM_DOM (0.00), TO_DN_ALL (0.00), TO_EQ_FROM (0.00), RCVD_COUNT_ZERO (0.10), RCVD_NO_TLS_LAST (0.10), HELO_NORES_A_OR_MX (0.30), MID_RHS_NOT_FQDN (0.50), UNPARSABLE_URL (0.50), FROMHOST_NORES_A_OR_MX (1.50), DIRECT_TO_MX (2.00), FORGED_RECIPIENTS (2.00), SUBJ_ALL_CAPS (3.00) -expect_header X-Spam-Status: Yes, score=9.10 +expect_header X-Spam-Result: DMARC_POLICY_ALLOW (-0.50), DKIM_ALLOW (-0.20), SPF_ALLOW (-0.20), ARC_NA (0.00), DKIM_SIGNED (0.00), FROM_EQ_ENV_FROM (0.00), FROM_HAS_DN (0.00), HAS_EXTERNAL_IMG (0.00), HAS_REPLYTO (0.00), HAS_X_PRIO_THREE (0.00), HTML_SHORT_1 (0.00), RCPT_COUNT_ONE (0.00), REPLYTO_DN_EQ_FROM_DN (0.00), REPLYTO_DOM_EQ_FROM_DOM (0.00), TO_DN_ALL (0.00), TO_EQ_FROM (0.00), RCVD_COUNT_ZERO (0.10), RCVD_NO_TLS_LAST (0.10), HELO_NORES_A_OR_MX (0.30), MID_RHS_NOT_FQDN (0.50), UNPARSABLE_URL (0.50), DATE_IN_PAST (1.00), FROMHOST_NORES_A_OR_MX (1.50), DIRECT_TO_MX (2.00), FORGED_RECIPIENTS (2.00), SUBJ_ALL_CAPS (3.00) +expect_header X-Spam-Status: Yes, score=10.10 Return-Path: DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; s=default; d=landeray.com; diff --git a/tests/resources/smtp/antispam/url.test b/tests/resources/smtp/antispam/url.test index a807dae5..9aa2dc2b 100644 --- a/tests/resources/smtp/antispam/url.test +++ b/tests/resources/smtp/antispam/url.test @@ -26,7 +26,7 @@ expect MIXED_CHARSET_URL Subject: test -my site is https://www.xn--80ak6aa92e.com/ +my site is https://www.xn--1ca81o6aa92e.com/ expect UNPARSABLE_URL