Port Spam filter to Rust - part 5

This commit is contained in:
mdecimus 2024-12-11 17:57:34 +01:00
parent 44ae796d9b
commit b5696c2d26
23 changed files with 1017 additions and 459 deletions

3
Cargo.lock generated
View file

@ -6450,6 +6450,7 @@ dependencies = [
"decancer",
"hyper 1.5.1",
"idna 1.0.3",
"infer 0.16.0",
"mail-auth",
"mail-builder",
"mail-parser",
@ -6457,6 +6458,8 @@ dependencies = [
"nlp",
"psl",
"reqwest 0.12.9",
"sha1",
"sha2 0.10.8",
"smtp-proto",
"store",
"tokio",

View file

@ -4,11 +4,15 @@
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/
use std::time::Duration;
use std::{net::SocketAddr, time::Duration};
use ahash::AHashSet;
use hyper::HeaderMap;
use mail_parser::HeaderName;
use utils::{config::Config, glob::GlobSet};
use utils::{
config::Config,
glob::{GlobMap, GlobSet},
};
use super::if_block::IfBlock;
@ -19,16 +23,60 @@ pub struct SpamFilterConfig {
pub max_rbl_email_checks: usize,
pub max_rbl_url_checks: usize,
pub greylist_duration: Option<Duration>,
pub pyzor: Option<PyzorConfig>,
pub asn: AsnLookupProvider,
pub list_dmarc_allow: GlobSet,
pub list_spf_dkim_allow: GlobSet,
pub list_freemail_providers: GlobSet,
pub list_disposable_providers: GlobSet,
pub list_trusted_domains: GlobSet,
pub list_url_redirectors: GlobSet,
pub list_file_extensions: GlobMap<FileExtension>,
pub remote_lists: Vec<RemoteListConfig>,
pub dnsbls: Vec<DnsblConfig>,
}
/// Provider used to resolve the autonomous system number (and optionally the
/// country) associated with an IP address.
#[derive(Debug, Clone, Default)]
pub enum AsnLookupProvider {
    /// DNS-based lookup against a per-address-family zone.
    Dns {
        // Zone queried for IPv4 addresses.
        ipv4_zone: String,
        // Zone queried for IPv6 addresses.
        ipv6_zone: String,
        // Separator between the fields of the DNS reply.
        separator: char,
        // Index of the ASN field within the separated reply.
        asn_index: usize,
        // Index of the country field, when the zone provides one.
        country_index: Option<usize>,
    },
    /// HTTP REST API lookup.
    Rest {
        // Endpoint URL of the lookup API.
        api: String,
        // Request timeout.
        timeout: Duration,
        // Extra headers sent with each request (e.g. authentication).
        headers: HeaderMap,
        // Path to the ASN field in the response body
        // (presumably a JSON pointer — confirm against the lookup code).
        asn_path: Vec<String>,
        // Path to the country field, when available.
        country_path: Option<Vec<String>>,
    },
    /// ASN lookups disabled.
    #[default]
    None,
}
/// Connection settings and spam thresholds for the Pyzor digest service.
#[derive(Debug, Clone)]
pub struct PyzorConfig {
    // Address of the Pyzor server.
    pub address: SocketAddr,
    // Network timeout for a single check.
    pub timeout: Duration,
    // A digest is only considered spam when its report count exceeds this value.
    pub min_count: u64,
    // Whitelist reports below this count do not clear a digest.
    pub min_wl_count: u64,
    // Maximum whitelist/report ratio for a digest to still count as spam.
    pub ratio: f64,
}
/// Properties of an attachment file extension from the spam-filter extension list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileExtension {
    // Content types this extension is expected to be delivered with.
    pub known_types: AHashSet<String>,
    // Extension is flagged as dangerous by the list (used for MIME_BAD_EXTENSION).
    pub is_bad: bool,
    // Extension denotes an archive format (used for MIME_ARCHIVE_IN_ARCHIVE).
    pub is_archive: bool,
    // NOTE(review): the meaning of `is_nz` is not evident from this file — confirm
    // against the extension-list loader before documenting further.
    pub is_nz: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Element {
Url,

View file

@ -82,6 +82,32 @@ pub fn fn_cosine_similarity<'x>(_: &'x Context<'x>, v: Vec<Variable>) -> Variabl
.into()
}
/// Computes the cosine similarity between two bags of words.
///
/// Each slice is treated as a multiset of tokens; the result is the cosine of
/// the angle between the two term-frequency vectors, in `[0.0, 1.0]`.
/// Returns `0.0` when either input is empty (zero magnitude).
pub fn cosine_similarity(a: &[&str], b: &[&str]) -> f64 {
    // Term frequency per word: [occurrences in `a`, occurrences in `b`].
    let mut word_freq: HashMap<&str, [u32; 2]> = HashMap::new();

    for (idx, items) in [a, b].into_iter().enumerate() {
        for item in items {
            word_freq.entry(item).or_insert([0, 0])[idx] += 1;
        }
    }

    // Accumulate in u64: squaring a u32 count (or summing many products) can
    // overflow u32 for large inputs.
    let mut dot_product: u64 = 0;
    let mut magnitude_a: u64 = 0;
    let mut magnitude_b: u64 = 0;

    for count in word_freq.values() {
        let (ca, cb) = (u64::from(count[0]), u64::from(count[1]));
        dot_product += ca * cb;
        magnitude_a += ca * ca;
        magnitude_b += cb * cb;
    }

    if magnitude_a != 0 && magnitude_b != 0 {
        dot_product as f64 / (magnitude_a as f64).sqrt() / (magnitude_b as f64).sqrt()
    } else {
        0.0
    }
}
pub fn fn_jaccard_similarity<'x>(_: &'x Context<'x>, v: Vec<Variable>) -> Variable {
let mut word_freq = [HashSet::new(), HashSet::new()];

View file

@ -291,6 +291,70 @@ impl<T: SessionStream> Session<T> {
}
if self.is_allowed().await {
// Greylist
if let Some(greylist_duration) = self
.server
.core
.spam
.greylist_duration
.filter(|_| self.data.authenticated_as.is_none())
{
let key = format!(
"g:{}:{}:{}",
self.data.remote_ip_str,
self.data.mail_from.as_ref().unwrap().address_lcase,
self.data.rcpt_to.last().unwrap().address_lcase
);
match self
.server
.lookup_store()
.key_exists(key.clone().into_bytes())
.await
{
Ok(true) => (),
Ok(false) => {
match self
.server
.lookup_store()
.key_set(key.into_bytes(), vec![], greylist_duration.as_secs().into())
.await
{
Ok(_) => {
let rcpt = self.data.rcpt_to.pop().unwrap();
trc::event!(
Smtp(SmtpEvent::RcptToGreylisted),
SpanId = self.data.session_id,
To = rcpt.address_lcase,
);
return self
.write(
concat!(
"422 4.2.2 Greylisted, please try ",
"again in a few moments.\r\n"
)
.as_bytes(),
)
.await;
}
Err(err) => {
trc::error!(err
.span_id(self.data.session_id)
.caused_by(trc::location!())
.details("Failed to set greylist."));
}
}
}
Err(err) => {
trc::error!(err
.span_id(self.data.session_id)
.caused_by(trc::location!())
.details("Failed to check greylist."));
}
}
}
trc::event!(
Smtp(SmtpEvent::RcptTo),
SpanId = self.data.session_id,

View file

@ -15,12 +15,16 @@ mail-parser = { version = "0.9", features = ["full_encoding", "ludicrous_mode"]
mail-builder = { version = "0.3", features = ["ludicrous_mode"] }
mail-auth = { version = "0.5" }
mail-send = { version = "0.4", default-features = false, features = ["cram-md5", "ring", "tls12"] }
tokio = { version = "1.23", features = ["net", "macros"] }
psl = "2"
hyper = { version = "1.0.1", features = ["server", "http1", "http2"] }
idna = "1.0"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-webpki-roots", "http2", "stream"]}
decancer = "3.0.1"
unicode-security = "0.1.0"
infer = "0.16"
sha1 = "0.10"
sha2 = "0.10.6"
[features]
test_mode = []

View file

@ -0,0 +1,323 @@
use std::future::Future;
use common::Server;
use hyper::Uri;
use mail_parser::MimeHeaders;
use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use crate::{modules::html::*, Hostname, SpamFilterContext, TextPart};
/// Extension trait that performs HTML-specific spam analysis on a message.
pub trait SpamFilterAnalyzeHtml: Sync + Send {
    /// Scans the HTML parts of the message in `ctx` and records the matching
    /// spam tags in `ctx.result`.
    fn spam_filter_analyze_html(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
/// Target of the most recently opened `<a>` tag, tracked while walking the
/// HTML token stream.
struct Href {
    // Parsed form of the `href` attribute, when it is a valid URI.
    url_parsed: Option<Uri>,
    // Hostname extracted from the URI, when present.
    host: Option<Hostname>,
}
impl SpamFilterAnalyzeHtml for Server {
    /// Walks the HTML token stream of every HTML text part and records
    /// HTML-related spam tags (data URIs, phishing anchors, image/text
    /// ratios, unbalanced tags, etc.) in `ctx.result`.
    async fn spam_filter_analyze_html(&self, ctx: &mut SpamFilterContext<'_>) {
        // Message only has text/html MIME parts
        if ctx.input.message.content_type().map_or(false, |ct| {
            ct.ctype().eq_ignore_ascii_case("text")
                && ct
                    .subtype()
                    .unwrap_or_default()
                    .eq_ignore_ascii_case("html")
        }) {
            ctx.result.add_tag("MIME_HTML_ONLY");
        }

        // Target of the currently open <a> tag, if any.
        let mut last_href: Option<Href> = None;
        let mut html_img_words = 0;
        let mut html_text_chars = 0;
        // Nesting depth of <head>/<body>; non-zero at the end of a body part
        // means the tags were not properly balanced.
        let mut in_head: i32 = 0;
        let mut in_body: i32 = 0;

        for (part_id, part) in ctx.output.text_parts.iter().enumerate() {
            let is_body_part = ctx.input.message.text_body.contains(&part_id)
                || ctx.input.message.html_body.contains(&part_id);
            let (html_tokens, tokens) = if let TextPart::Html {
                html_tokens,
                tokens,
                ..
            } = part
            {
                (html_tokens, tokens)
            } else {
                continue;
            };
            let mut has_link_to_img = false;

            for token in html_tokens {
                match token {
                    HtmlToken::StartTag {
                        name,
                        attributes,
                        is_self_closing,
                    } => match *name {
                        A => {
                            if let Some(attr) = attributes.iter().find_map(|(attr, value)| {
                                if *attr == HREF {
                                    value.as_deref()
                                } else {
                                    None
                                }
                            }) {
                                let url = attr.trim().to_lowercase();
                                let url_parsed = url.parse::<Uri>().ok();
                                let href = Href {
                                    host: url_parsed
                                        .as_ref()
                                        .and_then(|uri| uri.host().map(Hostname::new)),
                                    url_parsed,
                                };

                                if is_body_part
                                    && attr.starts_with("data:")
                                    && attr.contains(";base64,")
                                {
                                    // Has Data URI encoding
                                    ctx.result.add_tag("HAS_DATA_URI");

                                    if attr.contains("text/") {
                                        // Uses Data URI encoding to obfuscate plain or HTML in base64
                                        ctx.result.add_tag("DATA_URI_OBFU");
                                    }
                                } else if href.host.as_ref().map_or(false, |h| h.ip.is_some()) {
                                    // HTML anchor points to an IP address
                                    ctx.result.add_tag("HTTP_TO_IP");
                                }

                                // Only track the href for non-self-closing anchors,
                                // as those may enclose text/images to analyze.
                                if !*is_self_closing {
                                    last_href = Some(href);
                                }
                            }
                        }
                        IMG if is_body_part => {
                            // Default dimensions assumed when the tag omits them.
                            let mut img_width = 800;
                            let mut img_height = 600;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    let dimension = match *attr {
                                        WIDTH => &mut img_width,
                                        HEIGHT => &mut img_height,
                                        SRC => {
                                            let src = value.to_ascii_lowercase();
                                            if src.starts_with("data:") && src.contains(";base64,")
                                            {
                                                // Has Data URI encoding (fixed: the tag name
                                                // was the comment text instead of the
                                                // HAS_DATA_URI tag used elsewhere)
                                                ctx.result.add_tag("HAS_DATA_URI");
                                            }
                                            continue;
                                        }
                                        _ => {
                                            continue;
                                        }
                                    };

                                    if let Some(pct) = value.strip_suffix('%') {
                                        if let Ok(pct) = pct.trim().parse::<u64>() {
                                            *dimension = (*dimension * pct) / 100;
                                        }
                                    } else if let Ok(value) = value.parse::<u64>() {
                                        *dimension = value;
                                    }
                                }
                            }

                            let dimensions = img_width + img_height;
                            if last_href.is_some() && dimensions >= 210 {
                                has_link_to_img = true;
                            }
                            if dimensions > 100 {
                                // We assume that a single picture 100x200 contains approx 3 words of text
                                html_img_words += dimensions / 100;
                            }
                        }
                        META => {
                            let mut has_equiv_refresh = false;
                            let mut has_content_url = false;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    if *attr == HTTP_EQUIV {
                                        if value.eq_ignore_ascii_case("refresh") {
                                            has_equiv_refresh = true;
                                        }
                                    } else if *attr == CONTENT
                                        && value.to_ascii_lowercase().contains("url=")
                                    {
                                        has_content_url = true;
                                    }
                                }
                            }

                            if has_equiv_refresh && has_content_url {
                                // HTML meta refresh tag
                                ctx.result.add_tag("HTML_META_REFRESH_URL");
                            }
                        }
                        LINK if is_body_part => {
                            let mut has_rel_style = false;
                            let mut has_href_css = false;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    if *attr == REL {
                                        if value.to_ascii_lowercase().contains("stylesheet") {
                                            has_rel_style = true;
                                        }
                                    } else if *attr == HREF
                                        && value.to_ascii_lowercase().ends_with(".css")
                                    {
                                        has_href_css = true;
                                    }
                                }
                            }

                            if has_rel_style || has_href_css {
                                // Has external CSS
                                ctx.result.add_tag("EXT_CSS");
                            }
                        }
                        HEAD if !*is_self_closing => {
                            in_head += 1;
                        }
                        BODY if !*is_self_closing => {
                            in_body += 1;
                        }
                        _ => {}
                    },
                    HtmlToken::EndTag { name } => match *name {
                        A => {
                            last_href = None;
                        }
                        HEAD => {
                            in_head -= 1;
                        }
                        BODY => {
                            in_body -= 1;
                        }
                        _ => (),
                    },
                    // Fixed: analyze rendered text (outside <head>) rather than
                    // only text inside <head>. The Sieve rule this ports required
                    // `in_head == 0`; the previous `in_head > 0` guard was inverted.
                    HtmlToken::Text { text } if in_head == 0 => {
                        if let Some((href_url, href_host)) = last_href
                            .as_ref()
                            .and_then(|href| Some((href.url_parsed.as_ref()?, href.host.as_ref()?)))
                        {
                            for token in TypesTokenizer::new(text.as_ref())
                                .tokenize_numbers(false)
                                .tokenize_urls(true)
                                .tokenize_urls_without_scheme(true)
                                .tokenize_emails(true)
                            {
                                let text_url = match token.word {
                                    TokenType::Url(url) => url.to_lowercase(),
                                    TokenType::UrlNoScheme(url) => {
                                        format!("http://{}", url.to_lowercase())
                                    }
                                    _ => continue,
                                };
                                let text_url_parsed =
                                    if let Ok(text_url_parsed) = text_url.parse::<Uri>() {
                                        text_url_parsed
                                    } else {
                                        continue;
                                    };

                                if href_url.scheme().map(|s| s.as_str()).unwrap_or_default()
                                    == "http"
                                    && text_url_parsed
                                        .scheme()
                                        .map(|s| s.as_str())
                                        .unwrap_or_default()
                                        == "https"
                                {
                                    // The anchor text contains a distinct scheme compared to the target URL
                                    ctx.result.add_tag("HTTP_TO_HTTPS");
                                }

                                if let Some(text_url_host) = text_url_parsed.host() {
                                    let text_url_host = Hostname::new(text_url_host);
                                    if text_url_host.sld_or_default() != href_host.sld_or_default()
                                    {
                                        // The anchor text contains a different domain than the target URL
                                        ctx.result.add_tag("PHISHING");
                                    }
                                }
                            }
                        }

                        if is_body_part {
                            html_text_chars += text.chars().filter(|t| t.is_alphanumeric()).count();
                        }
                    }
                    _ => (),
                }
            }

            if is_body_part {
                if in_head != 0 || in_body != 0 {
                    // HTML tags are not properly closed
                    ctx.result.add_tag("HTML_UNBALANCED_TAG");
                }

                if has_link_to_img {
                    match html_text_chars {
                        0..1024 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_1");
                        }
                        1024..1536 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_2");
                        }
                        1536..2048 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_3");
                        }
                        _ => (),
                    }
                }

                let mut html_words = 0;
                let mut html_uris = 0;
                for token in tokens {
                    match token {
                        TokenType::Alphabetic(_)
                        | TokenType::Alphanumeric(_)
                        | TokenType::Email(_) => {
                            html_words += 1;
                        }
                        TokenType::Url(_) | TokenType::UrlNoScheme(_) => {
                            html_uris += 1;
                        }
                        _ => (),
                    }
                }

                if (!has_link_to_img || html_text_chars >= 2048)
                    && html_img_words as f64 / (html_words as f64 + html_img_words as f64) > 0.5
                {
                    // Message contains more images than text
                    ctx.result.add_tag("HTML_TEXT_IMG_RATIO");
                }

                if html_uris > 0 && html_words == 0 {
                    // Message only contains URIs in HTML
                    ctx.result.add_tag("BODY_URI_ONLY");
                }
            }
        }
    }
}

View file

@ -13,14 +13,14 @@ use crate::{modules::dnsbl::is_dnsbl, SpamFilterContext, TextPart};
use super::{ElementLocation, SpamFilterResolver};
pub trait SpamFilterAnalyzeIpRev: Sync + Send {
pub trait SpamFilterAnalyzeIp: Sync + Send {
fn spam_filter_analyze_ip(
&self,
ctx: &mut SpamFilterContext<'_>,
) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzeIpRev for Server {
impl SpamFilterAnalyzeIp for Server {
async fn spam_filter_analyze_ip(&self, ctx: &mut SpamFilterContext<'_>) {
// IP Address RBL
let mut ips =

View file

@ -0,0 +1,426 @@
use std::{collections::HashSet, future::Future, vec};
use common::{
scripts::functions::{array::cosine_similarity, unicode::CharUtils},
Server,
};
use hyper::Uri;
use mail_parser::{HeaderName, MimeHeaders, PartType};
use nlp::tokenizers::types::TokenType;
use unicode_security::MixedScript;
use crate::{Hostname, SpamFilterContext, TextPart};
/// Extension trait that performs MIME-structure spam analysis on a message.
pub trait SpamFilterAnalyzeMime: Sync + Send {
    /// Inspects the MIME headers, part hierarchy, encodings and attachments
    /// of the message in `ctx` and records matching spam tags in `ctx.result`.
    fn spam_filter_analyze_mime(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzeMime for Server {
    /// Inspects the MIME structure of the message (top-level headers, part
    /// hierarchy, transfer encodings and attachments) and records the
    /// matching spam tags in `ctx.result`.
    async fn spam_filter_analyze_mime(&self, ctx: &mut SpamFilterContext<'_>) {
        // Scan the top-level headers for MIME-related fields.
        let mut has_mime_version = false;
        let mut has_ct = false;
        let mut has_cte = false;
        let mut had_cd = false;
        let mut is_plain_text = false;

        for header in ctx.input.message.headers() {
            match &header.name {
                HeaderName::MimeVersion => {
                    // Tag when the raw header name is not spelled exactly
                    // "MIME-Version" (unusual casing is a spam signal).
                    if ctx
                        .input
                        .message
                        .raw_message()
                        .get(header.offset_field..header.offset_start - 1)
                        != Some(b"MIME-Version")
                    {
                        ctx.result.add_tag("MV_CASE");
                    }
                    has_mime_version = true;
                }
                HeaderName::ContentType => {
                    has_ct = true;
                    is_plain_text = header.value().as_content_type().map_or(false, |ct| {
                        ct.ctype().eq_ignore_ascii_case("text")
                            && ct
                                .subtype()
                                .unwrap_or_default()
                                .eq_ignore_ascii_case("plain")
                    });
                }
                HeaderName::ContentTransferEncoding => {
                    has_cte = true;
                }
                HeaderName::ContentDisposition => {
                    had_cd = true;
                }
                _ => (),
            }
        }

        // MIME headers present without a MIME-Version header.
        if !has_mime_version && (has_ct || has_cte) {
            ctx.result.add_tag("MISSING_MIME_VERSION");
        }

        if has_ct && !is_plain_text && !has_cte && !had_cd {
            // Only Content-Type header without other MIME headers
            ctx.result.add_tag("MIME_HEADER_CTYPE_ONLY");
        }

        let raw_message = ctx.input.message.raw_message();
        let mut has_text_part = false;
        let mut is_encrypted = false;
        let mut is_encrypted_smime = false;
        let mut is_encrypted_pgp = false;
        let mut num_parts = 0;
        let mut num_parts_size = 0;

        for (part_id, part) in ctx.input.message.parts.iter().enumerate() {
            // Per-part Content-* header state.
            let mut ct = None;
            let mut cd = None;
            let mut ct_type = String::new();
            let mut ct_subtype = String::new();
            let mut cte = String::new();
            let mut is_attachment = ctx.input.message.attachments.contains(&part_id);
            let mut has_content_id = false;

            for header in part.headers() {
                match &header.name {
                    HeaderName::ContentType => {
                        if let Some(ct_) = header.value().as_content_type() {
                            ct_type = ct_.ctype().to_ascii_lowercase();
                            ct_subtype = ct_.subtype().unwrap_or_default().to_ascii_lowercase();
                            ct = Some(ct_);
                        }

                        if ct_type.is_empty() {
                            // Content-Type header can't be parsed
                            ctx.result.add_tag("BROKEN_CONTENT_TYPE");
                        }

                        if raw_message
                            .get(header.offset_start..header.offset_end)
                            .and_then(|s| s.trim_ascii_end().last())
                            == Some(&b';')
                        {
                            // Content-Type header ends with a semi-colon
                            ctx.result.add_tag("CT_EXTRA_SEMI");
                        }
                    }
                    HeaderName::ContentTransferEncoding => {
                        let cte_ = header.value().as_text().unwrap_or_default();
                        cte = cte_.to_ascii_lowercase();
                        // Tag non-lowercase Content-Transfer-Encoding values.
                        if cte != cte_ {
                            ctx.result.add_tag("CTE_CASE");
                        }
                    }
                    HeaderName::ContentDisposition => {
                        cd = header.value().as_content_type();
                    }
                    HeaderName::ContentId => {
                        has_content_id = true;
                    }
                    _ => (),
                }
            }

            match ct_type.as_str() {
                "multipart" => {
                    let part_ids = match &part.body {
                        PartType::Multipart(parts) => parts.as_slice(),
                        _ => &[],
                    };

                    match ct_subtype.as_str() {
                        "alternative" => {
                            let mut has_plain_part = false;
                            let mut has_html_part = false;
                            let mut text_part_words = vec![];
                            let mut text_part_uris = 0;
                            let mut html_part_words = vec![];
                            let mut html_part_uris = 0;

                            // Collect words and distinct URI hosts from the first
                            // plain and first HTML alternatives only.
                            for text_part in part_ids.iter().map(|id| &ctx.output.text_parts[*id]) {
                                match text_part {
                                    TextPart::Plain { tokens, .. } if !has_plain_part => {
                                        words_and_uris(
                                            tokens,
                                            &mut text_part_words,
                                            &mut text_part_uris,
                                        );
                                        has_plain_part = true;
                                    }
                                    TextPart::Html { tokens, .. } if !has_html_part => {
                                        words_and_uris(
                                            tokens,
                                            &mut html_part_words,
                                            &mut html_part_uris,
                                        );
                                        has_html_part = true;
                                    }
                                    _ => (),
                                }
                            }

                            // Multipart message mostly text/html MIME
                            if has_html_part {
                                if !has_plain_part {
                                    ctx.result.add_tag("MIME_MA_MISSING_TEXT");
                                }
                            } else if has_plain_part {
                                ctx.result.add_tag("MIME_MA_MISSING_HTML");
                            }

                            // HTML and text parts are different
                            if has_plain_part
                                && has_html_part
                                && (!text_part_words.is_empty() || !html_part_words.is_empty())
                                && cosine_similarity(&text_part_words, &html_part_words) < 0.95
                            {
                                ctx.result.add_tag("R_PARTS_DIFFER");
                            }

                            // Odd URI count between parts
                            if text_part_uris != html_part_uris {
                                ctx.result.add_tag("URI_COUNT_ODD");
                            }
                        }
                        "mixed" => {
                            let mut num_text_parts = 0;
                            let mut has_other_parts = false;

                            for (sub_part_id, sub_part) in part_ids
                                .iter()
                                .map(|id| (*id, &ctx.input.message.parts[*id]))
                            {
                                let ctype = sub_part
                                    .content_type()
                                    .map(|ct| ct.ctype())
                                    .unwrap_or_default();

                                if ctype.eq_ignore_ascii_case("text")
                                    && !ctx.input.message.attachments.contains(&sub_part_id)
                                {
                                    num_text_parts += 1;
                                } else if !ctype.eq_ignore_ascii_case("multipart") {
                                    has_other_parts = true;
                                }
                            }

                            // Found multipart/mixed without non-textual part
                            if !has_other_parts && num_text_parts < 3 {
                                ctx.result.add_tag("CTYPE_MIXED_BOGUS");
                            }
                        }
                        "encrypted" => {
                            is_encrypted = true;
                        }
                        _ => (),
                    }

                    // Multipart containers are not counted in num_parts below.
                    continue;
                }
                "text" => {
                    let mut is_7bit = false;

                    match cte.as_str() {
                        "" | "7bit" => {
                            if raw_message
                                .get(part.raw_body_offset()..part.raw_end_offset())
                                .map_or(false, |bytes| !bytes.is_ascii())
                            {
                                // MIME text part claims to be ASCII but isn't
                                ctx.result.add_tag("R_BAD_CTE_7BIT");
                            }
                            is_7bit = true;
                        }
                        "base64" => {
                            if part.contents().is_ascii() {
                                // Has text part encoded in base64 that does not contain any 8bit characters
                                ctx.result.add_tag("MIME_BASE64_TEXT_BOGUS");
                            } else {
                                // Has text part encoded in base64
                                ctx.result.add_tag("MIME_BASE64_TEXT");
                            }
                        }
                        _ => (),
                    }

                    if !is_7bit
                        && ct_subtype == "plain"
                        && ct
                            .and_then(|ct| ct.attribute("charset"))
                            .map_or(true, |c| c.is_empty())
                    {
                        // Charset header is missing
                        ctx.result.add_tag("R_MISSING_CHARSET");
                    }

                    match &part.body {
                        PartType::Text(text) | PartType::Html(text)
                            if ctx.input.message.text_body.contains(&part_id)
                                || ctx.input.message.html_body.contains(&part_id) =>
                        {
                            if !text.as_ref().is_single_script() {
                                // Text part contains multiple scripts
                                ctx.result.add_tag("R_MIXED_CHARSET");
                            }
                        }
                        _ => (),
                    }

                    has_text_part = true;
                }
                "application" => match ct_subtype.as_str() {
                    "pkcs7-mime" => {
                        ctx.result.add_tag("ENCRYPTED_SMIME");
                        is_attachment = false;
                        is_encrypted_smime = true;
                    }
                    "pkcs7-signature" => {
                        ctx.result.add_tag("SIGNED_SMIME");
                        is_attachment = false;
                    }
                    "pgp-encrypted" => {
                        ctx.result.add_tag("ENCRYPTED_PGP");
                        is_attachment = false;
                        is_encrypted_pgp = true;
                    }
                    "pgp-signature" => {
                        ctx.result.add_tag("SIGNED_PGP");
                        is_attachment = false;
                    }
                    "octet-stream" => {
                        // Octet-stream part without Content-Id and without an
                        // attachment disposition or filename.
                        if !is_encrypted
                            && !has_content_id
                            && cd.map_or(true, |cd| {
                                cd.attribute("type")
                                    .unwrap_or_default()
                                    .to_ascii_lowercase()
                                    != "attachment"
                                    && !cd.has_attribute("filename")
                            })
                        {
                            ctx.result.add_tag("CTYPE_MISSING_DISPOSITION");
                        }
                    }
                    _ => (),
                },
                _ => (),
            }

            num_parts += 1;
            num_parts_size += part.len();
            let ct_full = format!("{ct_type}/{ct_subtype}");

            if is_attachment {
                // Has a MIME attachment
                ctx.result.add_tag("HAS_ATTACHMENT");

                match &part.body {
                    PartType::Binary(bytes) | PartType::InlineBinary(bytes) => {
                        if let Some(t) = infer::get(bytes.as_ref()) {
                            // NOTE(review): MIME_GOOD is tagged when the inferred
                            // type DIFFERS from the declared Content-Type and
                            // MIME_BAD when it matches — this looks inverted
                            // relative to the comments below; confirm the
                            // intended semantics of these tags.
                            if t.mime_type() != ct_full {
                                // Known content-type
                                ctx.result.add_tag("MIME_GOOD");
                            } else if ct_full != "application/octet-stream" {
                                // Known bad content-type
                                ctx.result.add_tag("MIME_BAD");
                            }
                        }
                    }
                    _ => (),
                }
            }

            // Analyze attachment name
            if let Some(attach_name) = part.attachment_name() {
                if attach_name.chars().any(|c| c.is_obscured()) {
                    // Attachment name contains zero-width space
                    ctx.result.add_tag("MIME_BAD_UNICODE");
                }

                let attach_name = attach_name.trim().to_lowercase();
                if let Some((name, ext)) = attach_name.rsplit_once('.').and_then(|(name, ext)| {
                    Some((name, self.core.spam.list_file_extensions.get(ext)?))
                }) {
                    // Second-level extension, e.g. the "tar" in "x.tar.gz".
                    let sub_ext = name
                        .rsplit_once('.')
                        .and_then(|(_, ext)| self.core.spam.list_file_extensions.get(ext));

                    if ext.is_bad {
                        // Attachment has a bad extension
                        if sub_ext.map_or(false, |e| e.is_bad) {
                            ctx.result.add_tag("MIME_DOUBLE_BAD_EXTENSION");
                        } else {
                            ctx.result.add_tag("MIME_BAD_EXTENSION");
                        }
                    }

                    if ext.is_archive && sub_ext.map_or(false, |e| e.is_archive) {
                        // Archive in archive
                        ctx.result.add_tag("MIME_ARCHIVE_IN_ARCHIVE");
                    }

                    if !ext.known_types.is_empty()
                        && ct_full != "application/octet-stream"
                        && !ext.known_types.contains(&ct_full)
                    {
                        // Invalid attachment mime type
                        ctx.result.add_tag("MIME_BAD_ATTACHMENT");
                    }
                }
            }
        }

        match num_parts_size {
            0 => {
                // Message contains no parts
                ctx.result.add_tag("COMPLETELY_EMPTY");
            }
            1..64 if num_parts == 1 => {
                // Message contains only one short part
                ctx.result.add_tag("SINGLE_SHORT_PART");
            }
            _ => (),
        }

        if has_text_part && (is_encrypted_pgp || is_encrypted_smime) {
            // Message contains both text and encrypted parts
            ctx.result.add_tag("BOGUS_ENCRYPTED_AND_TEXT");
        }
    }
}
/// Collects the alphabetic/alphanumeric tokens of `tokens` into `words` and
/// stores the number of distinct URL hosts (SLD, falling back to the FQDN)
/// in `uri_count`.
fn words_and_uris<'x, T: AsRef<str>>(
    tokens: &'x [TokenType<T>],
    words: &mut Vec<&'x str>,
    uri_count: &mut usize,
) {
    let mut hosts = HashSet::new();

    for token in tokens {
        match token {
            TokenType::Alphabetic(word) | TokenType::Alphanumeric(word) => {
                words.push(word.as_ref());
            }
            TokenType::Url(url) => {
                // Deduplicate by registrable domain rather than full URL.
                let host = url
                    .as_ref()
                    .parse::<Uri>()
                    .ok()
                    .and_then(|uri| uri.host().map(Hostname::new));
                if let Some(host) = host {
                    hosts.insert(host.sld.unwrap_or(host.fqdn));
                }
            }
            _ => {}
        }
    }

    *uri_count = hosts.len();
}

View file

@ -19,9 +19,12 @@ pub mod domain;
pub mod ehlo;
pub mod from;
pub mod headers;
pub mod html;
pub mod init;
pub mod ip;
pub mod messageid;
pub mod mime;
pub mod pyzor;
pub mod received;
pub mod recipient;
pub mod replyto;

View file

@ -0,0 +1,35 @@
use std::future::Future;
use common::Server;
use crate::{modules::pyzor::pyzor_check, SpamFilterContext};
/// Extension trait that checks a message against the Pyzor collaborative
/// spam-detection network.
pub trait SpamFilterAnalyzePyzor: Sync + Send {
    /// Queries Pyzor with the message digest and records the result in `ctx`.
    fn spam_filter_analyze_pyzor(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzePyzor for Server {
    /// Queries the configured Pyzor server with the message digest and adds
    /// the `PYZOR` tag when the reported counts exceed the configured
    /// thresholds. A no-op when Pyzor is not configured.
    async fn spam_filter_analyze_pyzor(&self, ctx: &mut SpamFilterContext<'_>) {
        if let Some(config) = &self.core.spam.pyzor {
            match pyzor_check(ctx.input.message, config).await {
                Ok(Some(result)) => {
                    // Tag when the digest is widely reported and not
                    // sufficiently whitelisted (absolute count or ratio).
                    if result.code == 200
                        && result.count > config.min_count
                        && (result.wl_count < config.min_wl_count
                            || (result.wl_count as f64 / result.count as f64) < config.ratio)
                    {
                        ctx.result.add_tag("PYZOR");
                    }
                    // TODO: log the time taken by the Pyzor check.
                    // (was `let todo = "log time";`, an unused placeholder binding)
                }
                Ok(None) => {}
                Err(err) => {
                    trc::error!(err.span_id(ctx.input.span_id));
                }
            }
        }
    }
}

View file

@ -438,7 +438,7 @@ fn is_single_html_url<T: AsRef<str>>(html_tokens: &[HtmlToken], tokens: &[TokenT
url_count = 0;
for token in html_tokens {
if matches!(token, HtmlToken::StartTag { name, attributes } if *name == A && attributes.iter().any(|(k, _)| *k == HREF))
if matches!(token, HtmlToken::StartTag { name, attributes, .. } if *name == A && attributes.iter().any(|(k, _)| *k == HREF))
{
url_count += 1;
}

View file

@ -5,6 +5,7 @@ pub enum HtmlToken {
StartTag {
name: u64,
attributes: Vec<(u64, Option<String>)>,
is_self_closing: bool,
},
EndTag {
name: u64,
@ -18,10 +19,46 @@ pub enum HtmlToken {
}
// HTML tag and attribute names are packed into a `u64` with the first ASCII
// character in the lowest byte, so the tokenizer can compare names with a
// single integer comparison. At most eight characters fit in a u64;
// `HTTP_EQUIV` therefore encodes only the first eight ("http-equ").
// NOTE(review): this assumes the tokenizer also stops accumulating after
// eight characters — confirm in `html_to_tokens`.
pub(crate) const A: u64 = b'a' as u64;
pub(crate) const IMG: u64 = (b'i' as u64) | (b'm' as u64) << 8 | (b'g' as u64) << 16;
pub(crate) const HEAD: u64 =
    (b'h' as u64) | (b'e' as u64) << 8 | (b'a' as u64) << 16 | (b'd' as u64) << 24;
pub(crate) const BODY: u64 =
    (b'b' as u64) | (b'o' as u64) << 8 | (b'd' as u64) << 16 | (b'y' as u64) << 24;
pub(crate) const META: u64 =
    (b'm' as u64) | (b'e' as u64) << 8 | (b't' as u64) << 16 | (b'a' as u64) << 24;
pub(crate) const LINK: u64 =
    (b'l' as u64) | (b'i' as u64) << 8 | (b'n' as u64) << 16 | (b'k' as u64) << 24;
pub(crate) const HREF: u64 =
    (b'h' as u64) | (b'r' as u64) << 8 | (b'e' as u64) << 16 | (b'f' as u64) << 24;
pub(crate) const SRC: u64 = (b's' as u64) | (b'r' as u64) << 8 | (b'c' as u64) << 16;
pub(crate) const WIDTH: u64 = (b'w' as u64)
    | (b'i' as u64) << 8
    | (b'd' as u64) << 16
    | (b't' as u64) << 24
    | (b'h' as u64) << 32;
pub(crate) const HEIGHT: u64 = (b'h' as u64)
    | (b'e' as u64) << 8
    | (b'i' as u64) << 16
    | (b'g' as u64) << 24
    | (b'h' as u64) << 32
    | (b't' as u64) << 40;
pub(crate) const REL: u64 = (b'r' as u64) | (b'e' as u64) << 8 | (b'l' as u64) << 16;
pub(crate) const CONTENT: u64 = (b'c' as u64)
    | (b'o' as u64) << 8
    | (b'n' as u64) << 16
    | (b't' as u64) << 24
    | (b'e' as u64) << 32
    | (b'n' as u64) << 40
    | (b't' as u64) << 48;
pub(crate) const HTTP_EQUIV: u64 = (b'h' as u64)
    | (b't' as u64) << 8
    | (b't' as u64) << 16
    | (b'p' as u64) << 24
    | (b'-' as u64) << 32
    | (b'e' as u64) << 40
    | (b'q' as u64) << 48
    | (b'u' as u64) << 56;
pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
let input = input.as_bytes();
@ -106,6 +143,7 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
}
let mut in_quote = false;
let mut is_self_closing = false;
let mut key: u64 = 0;
let mut shift = 0;
@ -123,6 +161,9 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
key |= ((ch - b'A' + b'a') as u64) << shift;
shift += 8;
}
b'/' if !in_quote => {
is_self_closing = true;
}
b'>' if !in_quote => {
if shift != 0 {
if tag == 0 {
@ -205,6 +246,7 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
tags.push(HtmlToken::StartTag {
name: tag,
attributes,
is_self_closing,
});
}
}
@ -292,7 +334,8 @@ mod tests {
tokens,
vec![HtmlToken::StartTag {
name: 7760228,
attributes: vec![]
attributes: vec![],
is_self_closing: false
}]
);
}
@ -325,14 +368,16 @@ mod tests {
vec![
HtmlToken::StartTag {
name: 7760228,
attributes: vec![]
attributes: vec![],
is_self_closing: false
},
HtmlToken::Text {
text: "Hello,".to_string()
},
HtmlToken::StartTag {
name: 1851879539,
attributes: vec![]
attributes: vec![],
is_self_closing: false
},
HtmlToken::Text {
text: " \" world \"".to_string()
@ -358,15 +403,18 @@ mod tests {
attributes: vec![
(1701869940, Some("text".to_string())),
(435761734006, Some("test".to_string()))
]
],
is_self_closing: false
},
HtmlToken::StartTag {
name: 111516266162547,
attributes: vec![]
attributes: vec![],
is_self_closing: true
},
HtmlToken::StartTag {
name: 6647407,
attributes: vec![(1920234593, None)]
attributes: vec![(1920234593, None)],
is_self_closing: true
},
HtmlToken::StartTag {
name: 97,
@ -374,7 +422,8 @@ mod tests {
(98, Some("1".to_string())),
(98, None),
(99, Some("123".to_string()))
]
],
is_self_closing: false
}
]
);

View file

@ -1,4 +1,5 @@
pub mod dnsbl;
pub mod html;
pub mod pyzor;
pub mod remote_list;
pub mod sanitize;

View file

@ -4,16 +4,14 @@
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/
use sieve::{runtime::Variable, FunctionMap};
use super::PluginContext;
use std::{
borrow::Cow,
io::Write,
net::SocketAddr,
time::{Duration, SystemTime},
};
use common::config::spamfilter::PyzorConfig;
use mail_parser::{decoders::html::add_html_token, Message, PartType};
use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use sha1::{Digest, Sha1};
@ -24,29 +22,27 @@ const ATOMIC_NUM_LINES: usize = 4;
const DIGEST_SPEC: &[(usize, usize)] = &[(20, 3), (60, 3)];
#[derive(Default, Debug, PartialEq, Eq)]
struct PyzorResponse {
code: u32,
count: u64,
wl_count: u64,
pub(crate) struct PyzorResponse {
pub code: u32,
pub count: u64,
pub wl_count: u64,
}
pub fn register(plugin_id: u32, fnc_map: &mut FunctionMap) {
fnc_map.set_external_function("pyzor_check", plugin_id, 2);
}
pub async fn exec(ctx: PluginContext<'_>) -> trc::Result<Variable> {
pub(crate) async fn pyzor_check(
message: &Message<'_>,
config: &PyzorConfig,
) -> trc::Result<Option<PyzorResponse>> {
// Make sure there is at least one text part
if !ctx
.message
if !message
.parts
.iter()
.any(|p| matches!(p.body, PartType::Text(_) | PartType::Html(_)))
{
return Ok(Variable::default());
return Ok(None);
}
// Hash message
let request = ctx.message.pyzor_check_message();
let request = message.pyzor_check_message();
#[cfg(feature = "test_mode")]
{
@ -74,35 +70,21 @@ pub async fn exec(ctx: PluginContext<'_>) -> trc::Result<Variable> {
}
}
let address = ctx.arguments[0].to_string();
let timeout = Duration::from_secs((ctx.arguments[1].to_integer() as u64).clamp(5, 60));
// Send message to address
pyzor_send_message(address.as_ref(), timeout, &request)
pyzor_send_message(config.address, config.timeout, &request)
.await
.map(Into::into)
.map_err(|err| {
trc::SpamEvent::PyzorError
.into_err()
.ctx(trc::Key::Url, address.to_string())
.ctx(trc::Key::Url, config.address.to_string())
.reason(err)
.details("Pyzor failed")
})
}
impl From<PyzorResponse> for Variable {
fn from(response: PyzorResponse) -> Self {
vec![
Variable::from(response.code),
Variable::from(response.count),
Variable::from(response.wl_count),
]
.into()
}
}
async fn pyzor_send_message(
addr: &str,
addr: SocketAddr,
timeout: Duration,
message: &str,
) -> std::io::Result<PyzorResponse> {
@ -451,7 +433,7 @@ mod test {
async fn send_message() {
assert_eq!(
pyzor_send_message(
"public.pyzor.org:24441",
"public.pyzor.org:24441".parse().unwrap(),
Duration::from_secs(10),
concat!(
"Op: check\n",

View file

@ -442,6 +442,7 @@ impl SmtpEvent {
SmtpEvent::RcptToDuplicate => "Duplicate RCPT TO",
SmtpEvent::RcptToRewritten => "RCPT TO address rewritten",
SmtpEvent::RcptToMissing => "RCPT TO address missing",
SmtpEvent::RcptToGreylisted => "RCPT TO greylisted",
SmtpEvent::TooManyRecipients => "Too many recipients",
SmtpEvent::TooManyInvalidRcpt => "Too many invalid recipients",
SmtpEvent::RawInput => "Raw SMTP input received",
@ -552,6 +553,7 @@ impl SmtpEvent {
}
SmtpEvent::RcptToRewritten => "The envelope recipient address was rewritten",
SmtpEvent::RcptToMissing => "The remote client issued a DATA command before RCPT TO",
SmtpEvent::RcptToGreylisted => "The recipient was greylisted",
SmtpEvent::TooManyRecipients => {
"The remote client exceeded the number of recipients allowed"
}

View file

@ -186,6 +186,7 @@ impl EventType {
| SmtpEvent::MailboxDoesNotExist
| SmtpEvent::RelayNotAllowed
| SmtpEvent::RcptTo
| SmtpEvent::RcptToGreylisted
| SmtpEvent::TooManyInvalidRcpt
| SmtpEvent::Vrfy
| SmtpEvent::VrfyNotFound

View file

@ -393,6 +393,7 @@ pub enum SmtpEvent {
RcptToDuplicate,
RcptToRewritten,
RcptToMissing,
RcptToGreylisted,
TooManyRecipients,
TooManyInvalidRcpt,
RawInput,

View file

@ -866,6 +866,7 @@ impl EventType {
EventType::Security(SecurityEvent::ScanBan) => 558,
EventType::Store(StoreEvent::AzureError) => 559,
EventType::TlsRpt(TlsRptEvent::RecordNotFound) => 560,
EventType::Smtp(SmtpEvent::RcptToGreylisted) => 561,
}
}
@ -1472,6 +1473,7 @@ impl EventType {
558 => Some(EventType::Security(SecurityEvent::ScanBan)),
559 => Some(EventType::Store(StoreEvent::AzureError)),
560 => Some(EventType::TlsRpt(TlsRptEvent::RecordNotFound)),
561 => Some(EventType::Smtp(SmtpEvent::RcptToGreylisted)),
_ => None,
}
}

View file

@ -1,9 +0,0 @@
set "triplet" "g:${env.remote_ip}.${envelope.from}.${envelope.to}";
if eval "!key_exists(SPAM_DB, triplet)" {
# Greylist sender for 30 days
eval "key_set(SPAM_DB, triplet, '', 2592000)";
reject "422 4.2.2 Greylisted, please try again in a few moments.";
stop;
}

View file

@ -1,148 +0,0 @@
# Rule set: analyze text/html MIME parts and raise HTML-related spam tags
# (phishing anchors, data-URI obfuscation, image-heavy bodies, etc.).

# Message only has text/html MIME parts
if eval "header.content-type == 'text/html'" {
    let "t.MIME_HTML_ONLY" "1";
}
foreverypart {
    if eval "eq_ignore_case(header.content-type, 'text/html')" {
        # Tokenize HTML
        let "is_body_part" "is_body()";
        let "html_tokens" "tokenize(part.text, 'html')";
        let "html_tokens_len" "len(html_tokens)";
        # Counters used for the short-part and text-to-image ratio checks below.
        let "html_char_count" "0";
        # NOTE(review): html_space_count is accumulated but never read — confirm
        # whether it is intentionally unused.
        let "html_space_count" "0";
        let "html_img_words" "0";
        let "html_words" "0";
        let "has_link_to_img" "0";
        let "has_uri" "0";
        let "has_text" "0";
        # Tag-nesting depth trackers, used for the unbalanced-tag check.
        let "in_head" "0";
        let "in_body" "0";
        # Anchor state: inside <a>, whether the href host is an IP literal,
        # and the normalized href itself.
        let "in_anchor" "0";
        let "in_anchor_href_ip" "0";
        let "in_anchor_href" "";
        let "i" "0";
        while "i < html_tokens_len" {
            let "token" "html_tokens[i]";
            let "i" "i + 1";
            # Tokens starting with '_' are text nodes
            if eval "starts_with(token, '_')" {
                # Text inside <head> is not user-visible, so skip it.
                if eval "in_head == 0" {
                    let "html_char_count" "html_char_count + count_chars(token)";
                    let "html_space_count" "html_space_count + count_spaces(token)";
                    let "text" "to_lowercase(trim(strip_prefix(token, '_')))";
                    let "html_words" "html_words + len(tokenize(text, 'words'))";
                    let "uris" "tokenize(text, 'uri')";
                    if eval "!is_empty(uris)" {
                        let "has_uri" "1";
                        let "uri" "uris[0]";
                        # Compare the URI shown in the anchor text against the
                        # actual link target.
                        if eval "in_anchor && !is_empty(in_anchor_href)" {
                            if eval "contains(text, '://') &&
                                     uri_part(uri, 'scheme') != uri_part(in_anchor_href, 'scheme')" {
                                # The anchor text contains a distinct scheme compared to the target URL
                                let "t.HTTP_TO_HTTPS" "1";
                            }
                            # Displayed URI and target disagree on host:
                            # second-level domain for host names, exact match
                            # for IP-literal targets.
                            if eval "(!in_anchor_href_ip && (domain_part(uri_part(uri, 'host'), 'sld') != domain_part(uri_part(in_anchor_href, 'host'), 'sld'))) ||
                                     (in_anchor_href_ip && (uri_part(uri, 'host') != uri_part(in_anchor_href, 'host')))" {
                                let "t.PHISHING" "1";
                            }
                        }
                    } elsif eval "!is_empty(text)" {
                        let "has_text" "1";
                    }
                }
            } elsif eval "starts_with(token, '<img')" {
                if eval "is_body_part" {
                    # Missing width/height attributes default to 800x600.
                    let "dimensions" "html_attr_size(token, 'width', 800) + html_attr_size(token, 'height', 600)";
                    if eval "in_anchor && dimensions >= 210" {
                        let "has_link_to_img" "1";
                    }
                    if eval "dimensions > 100" {
                        # We assume that a single picture 100x200 contains approx 3 words of text
                        let "html_img_words" "html_img_words + dimensions / 100";
                    }
                    let "img_src" "html_attr(token, 'src')";
                    if eval "starts_with(img_src, 'data:') && contains(img_src, ';base64,')" {
                        # Has Data URI encoding
                        let "t.HAS_DATA_URI" "1";
                    }
                }
            } elsif eval "starts_with(token, '<head')" {
                let "in_head" "in_head + 1";
            } elsif eval "starts_with(token, '</head')" {
                let "in_head" "in_head - 1";
            } elsif eval "starts_with(token, '<body')" {
                let "in_body" "in_body + 1";
            } elsif eval "starts_with(token, '</body')" {
                let "in_body" "in_body - 1";
            } elsif eval "starts_with(token, '<a ')" {
                # Entering an anchor: remember its normalized href for the
                # text-vs-target comparisons above.
                let "in_anchor" "1";
                let "in_anchor_href_ip" "0";
                let "in_anchor_href" "to_lowercase(trim(html_attr(token, 'href')))";
                if eval "is_body_part && starts_with(in_anchor_href, 'data:') && contains(in_anchor_href, ';base64,')" {
                    # Has Data URI encoding
                    let "t.HAS_DATA_URI" "1";
                    if eval "contains(in_anchor_href, 'text/')" {
                        # Uses Data URI encoding to obfuscate plain or HTML in base64
                        let "t.DATA_URI_OBFU" "1";
                    }
                } elsif eval "is_ip_addr(uri_part(in_anchor_href, 'host'))" {
                    # HTML anchor points to an IP address
                    let "t.HTTP_TO_IP" "1";
                    let "in_anchor_href_ip" "1";
                }
            } elsif eval "in_anchor && starts_with(token, '</a')" {
                let "in_anchor" "0";
            } elsif eval "starts_with(token, '<meta ')" {
                if eval "eq_ignore_case(html_attr(token, 'http-equiv'), 'refresh') &&
                         contains_ignore_case(html_attr(token, 'content'), 'url=')" {
                    # HTML meta refresh tag
                    let "t.HTML_META_REFRESH_URL" "1";
                }
            } elsif eval "starts_with(token, '<link') && is_body_part &&
                         (contains_ignore_case(html_attr(token, 'rel'), 'stylesheet') ||
                          contains_ignore_case(html_attr(token, 'href'), '.css') )" {
                # Body part pulls in an external stylesheet.
                let "t.EXT_CSS" "1";
            }
        }
        if eval "is_body_part" {
            # Check for unbalanced tags
            if eval "in_head != 0 || in_body != 0" {
                let "t.HTML_UNBALANCED_TAG" "1";
            }
            # Check for short HTML parts with a link to an image
            if eval "has_link_to_img" {
                if eval "html_char_count < 1024" {
                    let "t.HTML_SHORT_LINK_IMG_1" "1";
                } elsif eval "html_char_count < 1536" {
                    let "t.HTML_SHORT_LINK_IMG_2" "1";
                } elsif eval "html_char_count < 2048" {
                    let "t.HTML_SHORT_LINK_IMG_3" "1";
                }
            }
            # NOTE(review): if the part has neither words nor images the
            # divisor below is zero — confirm the eval runtime handles this.
            if eval "(!has_link_to_img || html_char_count >= 2048) &&
                     (html_img_words / (html_words + html_img_words) > 0.5)" {
                # Message contains more images than text
                let "t.HTML_TEXT_IMG_RATIO" "1";
            }
            # Part contains URIs but no other visible text.
            if eval "has_uri && !has_text" {
                let "t.BODY_URI_ONLY" "1";
            }
        }
    }
}

View file

@ -1,232 +0,0 @@
if eval "!header.mime-version.exists" {
if eval "header.content-type.exists || header.content-transfer-encoding.exists" {
let "t.MISSING_MIME_VERSION" "1";
}
} elsif eval "header.mime-version.raw_name != 'MIME-Version'" {
let "t.MV_CASE" "1";
}
let "has_text_part" "0";
let "is_encrypted" "0";
let "parts_num" "0";
let "parts_max_len" "0";
if eval "header.Content-Type.exists && !header.Content-Disposition:Content-Transfer-Encoding:MIME-Version.exists && !eq_ignore_case(header.Content-Type, 'text/plain')" {
# Only Content-Type header without other MIME headers
let "t.MIME_HEADER_CTYPE_ONLY" "1";
}
foreverypart {
let "content_type" "to_lowercase(header.content-type)";
let "type" "to_lowercase(header.content-type.type)";
let "subtype" "to_lowercase(header.content-type.subtype)";
let "cte" "header.content-transfer-encoding";
let "part_is_attachment" "is_attachment()";
if eval "cte != '' && !is_lowercase(cte)" {
let "cte" "to_lowercase(cte)";
let "t.CTE_CASE" "1";
}
if eval "ends_with(header.content-type.raw, ';')" {
# Content-Type header ends with a semi-colon
let "t.CT_EXTRA_SEMI" "1";
}
if eval "type == 'multipart'" {
if eval "subtype == 'alternative'" {
let "has_plain_part" "0";
let "has_html_part" "0";
let "text_part_words" "";
let "text_part_uris" "0";
let "html_part_words" "";
let "html_part_uris" "0";
foreverypart {
let "ma_ct" "to_lowercase(header.content-type)";
if eval "!has_plain_part && ma_ct == 'text/plain'" {
let "text_part" "part.text";
let "text_part_words" "tokenize(text_part, 'words')";
let "text_part_uris" "count(dedup(uri_part(tokenize(text_part, 'uri_strict'), 'host')))";
let "has_plain_part" "1";
} elsif eval "!has_html_part && ma_ct == 'text/html'" {
let "html_part" "html_to_text(part.text)";
let "html_part_words" "tokenize(html_part, 'words')";
let "html_part_uris" "count(dedup(uri_part(tokenize(part.text, 'uri_strict'), 'host')))";
let "has_html_part" "1";
}
}
# Multipart message mostly text/html MIME
if eval "has_html_part" {
if eval "!has_plain_part" {
let "t.MIME_MA_MISSING_TEXT" "1";
}
} elsif eval "has_plain_part" {
let "t.MIME_MA_MISSING_HTML" "1";
}
# HTML and text parts are different
if eval "!t.R_PARTS_DIFFER && has_html_part && has_plain_part &&
(!is_empty(text_part_words) || !is_empty(html_part_words)) &&
cosine_similarity(text_part_words, html_part_words) < 0.95" {
let "t.R_PARTS_DIFFER" "1";
}
# Odd URI count between parts
if eval "text_part_uris != html_part_uris" {
set "t.URI_COUNT_ODD" "1";
}
} elsif eval "subtype == 'mixed'" {
let "num_text_parts" "0";
let "has_other_part" "0";
foreverypart {
if eval "eq_ignore_case(header.content-type.type, 'text') && !is_attachment()" {
let "num_text_parts" "num_text_parts + 1";
} elsif eval "!eq_ignore_case(header.content-type.type, 'multipart')" {
let "has_other_part" "1";
}
}
# Found multipart/mixed without non-textual part
if eval "!has_other_part && num_text_parts < 3" {
let "t.CTYPE_MIXED_BOGUS" "1";
}
} elsif eval "subtype == 'encrypted'" {
set "is_encrypted" "1";
}
} else {
if eval "type == 'text'" {
# MIME text part claims to be ASCII but isn't
if eval "cte == '' || cte == '7bit'" {
if eval "!is_ascii(part.raw)" {
let "t.R_BAD_CTE_7BIT" "1";
}
} else {
if eval "cte == 'base64'" {
if eval "is_ascii(part.text)" {
# Has text part encoded in base64 that does not contain any 8bit characters
let "t.MIME_BASE64_TEXT_BOGUS" "1";
} else {
# Has text part encoded in base64
let "t.MIME_BASE64_TEXT" "1";
}
}
if eval "subtype == 'plain' && is_empty(header.content-type.attr.charset)" {
# Charset header is missing
let "t.R_MISSING_CHARSET" "1";
}
}
let "has_text_part" "1";
} elsif eval "type == 'application'" {
if eval "subtype == 'pkcs7-mime'" {
let "t.ENCRYPTED_SMIME" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pkcs7-signature'" {
let "t.SIGNED_SMIME" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pgp-encrypted'" {
let "t.ENCRYPTED_PGP" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pgp-signature'" {
let "t.SIGNED_PGP" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'octet-stream'" {
if eval "!is_encrypted &&
!header.content-id.exists &&
(!header.content-disposition.exists ||
(!eq_ignore_case(header.content-disposition.type, 'attachment') &&
is_empty(header.content-disposition.attr.filename)))" {
let "t.CTYPE_MISSING_DISPOSITION" "1";
}
}
}
# Increase part count
let "parts_num" "parts_num + 1";
if eval "parts_num == 1" {
let "parts_len" "mime_part_len()";
if eval "parts_len > parts_max_len" {
let "parts_max_len" "parts_len";
}
}
}
if eval "is_empty(type) && header.content-type.exists" {
let "t.BROKEN_CONTENT_TYPE" "1";
}
if eval "part_is_attachment" {
# Has a MIME attachment
let "t.HAS_ATTACHMENT" "1";
# Detect and compare mime type
let "detected_mime_type" "detect_file_type('mime')";
if eval "!is_empty(detected_mime_type)" {
if eval "detected_mime_type == content_type" {
# Known content-type
let "t.MIME_GOOD" "1";
} elsif eval "content_type != 'application/octet-stream'" {
# Known bad content-type
let "t.MIME_BAD" "1";
}
}
}
# Analyze attachment name
let "attach_name" "attachment_name()";
if eval "!is_empty(attach_name)" {
if eval "has_obscured(attach_name)" {
let "t.MIME_BAD_UNICODE" "1";
}
let "name_parts" "rsplit(to_lowercase(attach_name), '.')";
if eval "count(name_parts) > 1" {
let "ext_type" "key_get('spam-mime', name_parts[0])";
if eval "!is_empty(ext_type)" {
let "ext_type_double" "key_get('spam-mime', name_parts[1])";
if eval "contains(ext_type, 'BAD')" {
# Bad extension
if eval "contains(ext_type_double, 'BAD')" {
let "t.MIME_DOUBLE_BAD_EXTENSION" "1";
} else {
let "t.MIME_BAD_EXTENSION" "1";
}
}
if eval "contains(ext_type, 'AR') && contains(ext_type_double, 'AR')" {
# Archive in archive
let "t.MIME_ARCHIVE_IN_ARCHIVE" "1";
}
if eval "contains(ext_type, '/') &&
content_type != 'application/octet-stream' &&
!contains(split(ext_type, '|'), content_type)" {
# Invalid attachment mime type
let "t.MIME_BAD_ATTACHMENT" "1";
}
}
}
}
}
# Message contains both text and encrypted parts
if eval "has_text_part && (t.ENCRYPTED_SMIME || t.ENCRYPTED_PGP)" {
let "t.BOGUS_ENCRYPTED_AND_TEXT" "1";
}
# Message contains only one short part
if eval "parts_num == 1 && parts_max_len < 64" {
let "t.SINGLE_SHORT_PART" "1";
} elsif eval "parts_max_len == 0" {
let "t.COMPLETELY_EMPTY" "1";
}
# Check for mixed script in body
if eval "!is_single_script(text_body)" {
let "t.R_MIXED_CHARSET" "1";
}

View file

@ -1,11 +0,0 @@
# Pyzor collaborative spam digest check.
# Check message hash against Pyzor on public.pyzor.org:24441 using a 5 second timeout
let "pyzor_response" "pyzor_check('public.pyzor.org:24441', 5)";
# Response layout (as consumed below): [0] = status code (200 = success),
# [1] = number of spam reports, [2] = number of whitelist reports.
if eval "!is_empty(pyzor_response) && pyzor_response[0] == 200" {
    let "count" "pyzor_response[1]";
    let "wl_count" "pyzor_response[2]";
    # Tag as Pyzor-listed when widely reported and not significantly
    # whitelisted (few whitelist reports, or a low whitelist-to-report ratio).
    if eval "count > 5 && (wl_count < 10 || wl_count / count < 0.2)" {
        let "t.PYZOR" "1";
    }
}

View file

@ -1,12 +0,0 @@
# Bayes classifier training entry point. The 'train' environment variable
# selects the class ('spam' or 'ham'); anything else is rejected.
# Obtain thread name and subject
let "contents" "thread_name(header.subject) + ' ' + body.to_text";
if eval "env.train == 'spam'" {
    # Third argument selects the class: true = train as spam.
    eval "bayes_train(SPAM_DB, contents, true)";
} elsif eval "env.train == 'ham'" {
    # false = train as ham.
    eval "bayes_train(SPAM_DB, contents, false)";
} else {
    # Refuse to train without an explicit class.
    reject "Missing variable 'train'";
}