diff --git a/Cargo.lock b/Cargo.lock index 2e0e0474..81b146d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6450,6 +6450,7 @@ dependencies = [ "decancer", "hyper 1.5.1", "idna 1.0.3", + "infer 0.16.0", "mail-auth", "mail-builder", "mail-parser", @@ -6457,6 +6458,8 @@ dependencies = [ "nlp", "psl", "reqwest 0.12.9", + "sha1", + "sha2 0.10.8", "smtp-proto", "store", "tokio", diff --git a/crates/common/src/config/spamfilter.rs b/crates/common/src/config/spamfilter.rs index c0573ebd..5fc260ad 100644 --- a/crates/common/src/config/spamfilter.rs +++ b/crates/common/src/config/spamfilter.rs @@ -4,11 +4,15 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use std::time::Duration; +use std::{net::SocketAddr, time::Duration}; use ahash::AHashSet; +use hyper::HeaderMap; use mail_parser::HeaderName; -use utils::{config::Config, glob::GlobSet}; +use utils::{ + config::Config, + glob::{GlobMap, GlobSet}, +}; use super::if_block::IfBlock; @@ -19,16 +23,60 @@ pub struct SpamFilterConfig { pub max_rbl_email_checks: usize, pub max_rbl_url_checks: usize, + pub greylist_duration: Option, + + pub pyzor: Option, + pub asn: AsnLookupProvider, + pub list_dmarc_allow: GlobSet, pub list_spf_dkim_allow: GlobSet, pub list_freemail_providers: GlobSet, pub list_disposable_providers: GlobSet, pub list_trusted_domains: GlobSet, pub list_url_redirectors: GlobSet, + pub list_file_extensions: GlobMap, + pub remote_lists: Vec, pub dnsbls: Vec, } +#[derive(Debug, Clone, Default)] +pub enum AsnLookupProvider { + Dns { + ipv4_zone: String, + ipv6_zone: String, + separator: char, + asn_index: usize, + country_index: Option, + }, + Rest { + api: String, + timeout: Duration, + headers: HeaderMap, + asn_path: Vec, + country_path: Option>, + }, + #[default] + None, +} + +#[derive(Debug, Clone)] +pub struct PyzorConfig { + pub address: SocketAddr, + pub timeout: Duration, + pub min_count: u64, + pub min_wl_count: u64, + pub ratio: f64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct 
FileExtension { + pub known_types: AHashSet, + pub is_bad: bool, + pub is_archive: bool, + pub is_nz: bool, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Element { Url, diff --git a/crates/common/src/scripts/functions/array.rs b/crates/common/src/scripts/functions/array.rs index 832c727b..578c2bda 100644 --- a/crates/common/src/scripts/functions/array.rs +++ b/crates/common/src/scripts/functions/array.rs @@ -82,6 +82,32 @@ pub fn fn_cosine_similarity<'x>(_: &'x Context<'x>, v: Vec) -> Variabl .into() } +pub fn cosine_similarity(a: &[&str], b: &[&str]) -> f64 { + let mut word_freq: HashMap<&str, [u32; 2]> = HashMap::new(); + + for (idx, items) in [a, b].into_iter().enumerate() { + for item in items { + word_freq.entry(item).or_insert([0, 0])[idx] += 1; + } + } + + let mut dot_product = 0u64; + let mut magnitude_a = 0u64; + let mut magnitude_b = 0u64; + + for (_word, count) in word_freq.iter() { + dot_product += u64::from(count[0]) * u64::from(count[1]); + magnitude_a += u64::from(count[0]) * u64::from(count[0]); + magnitude_b += u64::from(count[1]) * u64::from(count[1]); + } + + if magnitude_a != 0 && magnitude_b != 0 { + dot_product as f64 / (magnitude_a as f64).sqrt() / (magnitude_b as f64).sqrt() + } else { + 0.0 + } +} + pub fn fn_jaccard_similarity<'x>(_: &'x Context<'x>, v: Vec) -> Variable { let mut word_freq = [HashSet::new(), HashSet::new()]; diff --git a/crates/smtp/src/inbound/rcpt.rs b/crates/smtp/src/inbound/rcpt.rs index 44213655..9c18ec4a 100644 --- a/crates/smtp/src/inbound/rcpt.rs +++ b/crates/smtp/src/inbound/rcpt.rs @@ -291,6 +291,70 @@ impl Session { } if self.is_allowed().await { + // Greylist + if let Some(greylist_duration) = self + .server + .core + .spam + .greylist_duration + .filter(|_| self.data.authenticated_as.is_none()) + { + let key = format!( + "g:{}:{}:{}", + self.data.remote_ip_str, + self.data.mail_from.as_ref().unwrap().address_lcase, + self.data.rcpt_to.last().unwrap().address_lcase + ); + match self + .server + .lookup_store() + .key_exists(key.clone().into_bytes()) + .await + { + 
Ok(true) => (), + Ok(false) => { + match self + .server + .lookup_store() + .key_set(key.into_bytes(), vec![], greylist_duration.as_secs().into()) + .await + { + Ok(_) => { + let rcpt = self.data.rcpt_to.pop().unwrap(); + + trc::event!( + Smtp(SmtpEvent::RcptToGreylisted), + SpanId = self.data.session_id, + To = rcpt.address_lcase, + ); + + return self + .write( + concat!( + "422 4.2.2 Greylisted, please try ", + "again in a few moments.\r\n" + ) + .as_bytes(), + ) + .await; + } + Err(err) => { + trc::error!(err + .span_id(self.data.session_id) + .caused_by(trc::location!()) + .details("Failed to set greylist.")); + } + } + } + Err(err) => { + trc::error!(err + .span_id(self.data.session_id) + .caused_by(trc::location!()) + .details("Failed to check greylist.")); + } + } + } + trc::event!( Smtp(SmtpEvent::RcptTo), SpanId = self.data.session_id, diff --git a/crates/spam-filter/Cargo.toml b/crates/spam-filter/Cargo.toml index c0d3ff42..926e38ab 100644 --- a/crates/spam-filter/Cargo.toml +++ b/crates/spam-filter/Cargo.toml @@ -15,12 +15,16 @@ mail-parser = { version = "0.9", features = ["full_encoding", "ludicrous_mode"] mail-builder = { version = "0.3", features = ["ludicrous_mode"] } mail-auth = { version = "0.5" } mail-send = { version = "0.4", default-features = false, features = ["cram-md5", "ring", "tls12"] } +tokio = { version = "1.23", features = ["net", "macros"] } psl = "2" hyper = { version = "1.0.1", features = ["server", "http1", "http2"] } idna = "1.0" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-webpki-roots", "http2", "stream"]} decancer = "3.0.1" unicode-security = "0.1.0" +infer = "0.16" +sha1 = "0.10" +sha2 = "0.10.6" [features] test_mode = [] diff --git a/crates/spam-filter/src/analysis/html.rs b/crates/spam-filter/src/analysis/html.rs new file mode 100644 index 00000000..558a2d6a --- /dev/null +++ b/crates/spam-filter/src/analysis/html.rs @@ -0,0 +1,323 @@ +use std::future::Future; + +use common::Server; +use 
hyper::Uri; +use mail_parser::MimeHeaders; +use nlp::tokenizers::types::{TokenType, TypesTokenizer}; + +use crate::{modules::html::*, Hostname, SpamFilterContext, TextPart}; + +pub trait SpamFilterAnalyzeHtml: Sync + Send { + fn spam_filter_analyze_html( + &self, + ctx: &mut SpamFilterContext<'_>, + ) -> impl Future + Send; +} + +struct Href { + url_parsed: Option, + host: Option, +} + +impl SpamFilterAnalyzeHtml for Server { + async fn spam_filter_analyze_html(&self, ctx: &mut SpamFilterContext<'_>) { + // Message only has text/html MIME parts + if ctx.input.message.content_type().map_or(false, |ct| { + ct.ctype().eq_ignore_ascii_case("text") + && ct + .subtype() + .unwrap_or_default() + .eq_ignore_ascii_case("html") + }) { + ctx.result.add_tag("MIME_HTML_ONLY"); + } + let mut last_href: Option = None; + let mut html_img_words = 0; + let mut html_text_chars = 0; + let mut in_head: i32 = 0; + let mut in_body: i32 = 0; + + for (part_id, part) in ctx.output.text_parts.iter().enumerate() { + let is_body_part = ctx.input.message.text_body.contains(&part_id) + || ctx.input.message.html_body.contains(&part_id); + + let (html_tokens, tokens) = if let TextPart::Html { + html_tokens, + tokens, + .. 
+ } = part + { + (html_tokens, tokens) + } else { + continue; + }; + let mut has_link_to_img = false; + + for token in html_tokens { + match token { + HtmlToken::StartTag { + name, + attributes, + is_self_closing, + } => match *name { + A => { + if let Some(attr) = attributes.iter().find_map(|(attr, value)| { + if *attr == HREF { + value.as_deref() + } else { + None + } + }) { + let url = attr.trim().to_lowercase(); + let url_parsed = url.parse::().ok(); + let href = Href { + host: url_parsed + .as_ref() + .and_then(|uri| uri.host().map(Hostname::new)), + url_parsed, + }; + + if is_body_part + && attr.starts_with("data:") + && attr.contains(";base64,") + { + // Has Data URI encoding + ctx.result.add_tag("HAS_DATA_URI"); + if attr.contains("text/") { + // Uses Data URI encoding to obfuscate plain or HTML in base64 + ctx.result.add_tag("DATA_URI_OBFU"); + } + } else if href.host.as_ref().map_or(false, |h| h.ip.is_some()) { + // HTML anchor points to an IP address + ctx.result.add_tag("HTTP_TO_IP"); + } + + if !*is_self_closing { + last_href = Some(href); + } + } + } + IMG if is_body_part => { + let mut img_width = 800; + let mut img_height = 600; + + for (attr, value) in attributes { + if let Some(value) = + value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty()) + { + let dimension = match *attr { + WIDTH => &mut img_width, + HEIGHT => &mut img_height, + SRC => { + let src = value.to_ascii_lowercase(); + if src.starts_with("data:") && src.contains(";base64,") + { + // Has Data URI encoding + ctx.result.add_tag("HAS_DATA_URI"); + } + continue; + } + _ => { + continue; + } + }; + if let Some(pct) = value.strip_suffix('%') { + if let Ok(pct) = pct.trim().parse::() { + *dimension = (*dimension * pct) / 100; + } + } else if let Ok(value) = value.parse::() { + *dimension = value; + } + } + } + let dimensions = img_width + img_height; + + if last_href.is_some() && dimensions >= 210 { + has_link_to_img = true; + } + + if dimensions > 100 { + // We assume that 
a single picture 100x200 contains approx 3 words of text + html_img_words += dimensions / 100; + } + } + META => { + let mut has_equiv_refresh = false; + let mut has_content_url = false; + + for (attr, value) in attributes { + if let Some(value) = + value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty()) + { + if *attr == HTTP_EQUIV { + if value.eq_ignore_ascii_case("refresh") { + has_equiv_refresh = true; + } + } else if *attr == CONTENT + && value.to_ascii_lowercase().contains("url=") + { + has_content_url = true; + } + } + } + + if has_equiv_refresh && has_content_url { + // HTML meta refresh tag + ctx.result.add_tag("HTML_META_REFRESH_URL"); + } + } + LINK if is_body_part => { + let mut has_rel_style = false; + let mut has_href_css = false; + + for (attr, value) in attributes { + if let Some(value) = + value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty()) + { + if *attr == REL { + if value.to_ascii_lowercase().contains("stylesheet") { + has_rel_style = true; + } + } else if *attr == HREF + && value.to_ascii_lowercase().ends_with(".css") + { + has_href_css = true; + } + } + } + + if has_rel_style || has_href_css { + // Has external CSS + ctx.result.add_tag("EXT_CSS"); + } + } + HEAD if !*is_self_closing => { + in_head += 1; + } + BODY if !*is_self_closing => { + in_body += 1; + } + _ => {} + }, + HtmlToken::EndTag { name } => match *name { + A => { + last_href = None; + } + HEAD => { + in_head -= 1; + } + BODY => { + in_body -= 1; + } + _ => (), + }, + HtmlToken::Text { text } if in_head == 0 => { + if let Some((href_url, href_host)) = last_href + .as_ref() + .and_then(|href| Some((href.url_parsed.as_ref()?, href.host.as_ref()?))) + { + for token in TypesTokenizer::new(text.as_ref()) + .tokenize_numbers(false) + .tokenize_urls(true) + .tokenize_urls_without_scheme(true) + .tokenize_emails(true) + { + let text_url = match token.word { + TokenType::Url(url) => url.to_lowercase(), + TokenType::UrlNoScheme(url) => { + format!("http://{}", 
url.to_lowercase()) + } + _ => continue, + }; + let text_url_parsed = + if let Ok(text_url_parsed) = text_url.parse::() { + text_url_parsed + } else { + continue; + }; + + if href_url.scheme().map(|s| s.as_str()).unwrap_or_default() + == "http" + && text_url_parsed + .scheme() + .map(|s| s.as_str()) + .unwrap_or_default() + == "https" + { + // The anchor text contains a distinct scheme compared to the target URL + ctx.result.add_tag("HTTP_TO_HTTPS"); + } + + if let Some(text_url_host) = text_url_parsed.host() { + let text_url_host = Hostname::new(text_url_host); + + if text_url_host.sld_or_default() != href_host.sld_or_default() + { + // The anchor text contains a different domain than the target URL + ctx.result.add_tag("PHISHING"); + } + } + } + } + + if is_body_part { + html_text_chars += text.chars().filter(|t| t.is_alphanumeric()).count(); + } + } + _ => (), + } + } + + if is_body_part { + if in_head != 0 || in_body != 0 { + // HTML tags are not properly closed + ctx.result.add_tag("HTML_UNBALANCED_TAG"); + } + + if has_link_to_img { + match html_text_chars { + 0..1024 => { + ctx.result.add_tag("HTML_SHORT_LINK_IMG_1"); + } + 1024..1536 => { + ctx.result.add_tag("HTML_SHORT_LINK_IMG_2"); + } + 1536..2048 => { + ctx.result.add_tag("HTML_SHORT_LINK_IMG_3"); + } + _ => (), + } + } + + let mut html_words = 0; + let mut html_uris = 0; + + for token in tokens { + match token { + TokenType::Alphabetic(_) + | TokenType::Alphanumeric(_) + | TokenType::Email(_) => { + html_words += 1; + } + TokenType::Url(_) | TokenType::UrlNoScheme(_) => { + html_uris += 1; + } + _ => (), + } + } + + if (!has_link_to_img || html_text_chars >= 2048) + && html_img_words as f64 / (html_words as f64 + html_img_words as f64) > 0.5 + { + // Message contains more images than text + ctx.result.add_tag("HTML_TEXT_IMG_RATIO"); + } + + if html_uris > 0 && html_words == 0 { + // Message only contains URIs in HTML + ctx.result.add_tag("BODY_URI_ONLY"); + } + } + } + } +} diff --git 
a/crates/spam-filter/src/analysis/ip.rs b/crates/spam-filter/src/analysis/ip.rs index d25a1f38..a9542df1 100644 --- a/crates/spam-filter/src/analysis/ip.rs +++ b/crates/spam-filter/src/analysis/ip.rs @@ -13,14 +13,14 @@ use crate::{modules::dnsbl::is_dnsbl, SpamFilterContext, TextPart}; use super::{ElementLocation, SpamFilterResolver}; -pub trait SpamFilterAnalyzeIpRev: Sync + Send { +pub trait SpamFilterAnalyzeIp: Sync + Send { fn spam_filter_analyze_ip( &self, ctx: &mut SpamFilterContext<'_>, ) -> impl Future + Send; } -impl SpamFilterAnalyzeIpRev for Server { +impl SpamFilterAnalyzeIp for Server { async fn spam_filter_analyze_ip(&self, ctx: &mut SpamFilterContext<'_>) { // IP Address RBL let mut ips = diff --git a/crates/spam-filter/src/analysis/mime.rs b/crates/spam-filter/src/analysis/mime.rs new file mode 100644 index 00000000..3e3aee01 --- /dev/null +++ b/crates/spam-filter/src/analysis/mime.rs @@ -0,0 +1,426 @@ +use std::{collections::HashSet, future::Future, vec}; + +use common::{ + scripts::functions::{array::cosine_similarity, unicode::CharUtils}, + Server, +}; +use hyper::Uri; +use mail_parser::{HeaderName, MimeHeaders, PartType}; +use nlp::tokenizers::types::TokenType; +use unicode_security::MixedScript; + +use crate::{Hostname, SpamFilterContext, TextPart}; + +pub trait SpamFilterAnalyzeMime: Sync + Send { + fn spam_filter_analyze_mime( + &self, + ctx: &mut SpamFilterContext<'_>, + ) -> impl Future + Send; +} + +impl SpamFilterAnalyzeMime for Server { + async fn spam_filter_analyze_mime(&self, ctx: &mut SpamFilterContext<'_>) { + let mut has_mime_version = false; + let mut has_ct = false; + let mut has_cte = false; + let mut had_cd = false; + let mut is_plain_text = false; + + for header in ctx.input.message.headers() { + match &header.name { + HeaderName::MimeVersion => { + if ctx + .input + .message + .raw_message() + .get(header.offset_field..header.offset_start - 1) + != Some(b"MIME-Version") + { + ctx.result.add_tag("MV_CASE"); + } + 
has_mime_version = true; + } + HeaderName::ContentType => { + has_ct = true; + is_plain_text = header.value().as_content_type().map_or(false, |ct| { + ct.ctype().eq_ignore_ascii_case("text") + && ct + .subtype() + .unwrap_or_default() + .eq_ignore_ascii_case("plain") + }); + } + HeaderName::ContentTransferEncoding => { + has_cte = true; + } + HeaderName::ContentDisposition => { + had_cd = true; + } + _ => (), + } + } + + if !has_mime_version && (has_ct || has_cte) { + ctx.result.add_tag("MISSING_MIME_VERSION"); + } + if has_ct && !is_plain_text && !has_cte && !had_cd { + // Only Content-Type header without other MIME headers + ctx.result.add_tag("MIME_HEADER_CTYPE_ONLY"); + } + let raw_message = ctx.input.message.raw_message(); + + let mut has_text_part = false; + let mut is_encrypted = false; + let mut is_encrypted_smime = false; + let mut is_encrypted_pgp = false; + + let mut num_parts = 0; + let mut num_parts_size = 0; + + for (part_id, part) in ctx.input.message.parts.iter().enumerate() { + let mut ct = None; + let mut cd = None; + let mut ct_type = String::new(); + let mut ct_subtype = String::new(); + let mut cte = String::new(); + let mut is_attachment = ctx.input.message.attachments.contains(&part_id); + let mut has_content_id = false; + + for header in part.headers() { + match &header.name { + HeaderName::ContentType => { + if let Some(ct_) = header.value().as_content_type() { + ct_type = ct_.ctype().to_ascii_lowercase(); + ct_subtype = ct_.subtype().unwrap_or_default().to_ascii_lowercase(); + ct = Some(ct_); + } + + if ct_type.is_empty() { + // Content-Type header can't be parsed + ctx.result.add_tag("BROKEN_CONTENT_TYPE"); + } + + if raw_message + .get(header.offset_start..header.offset_end) + .and_then(|s| s.trim_ascii_end().last()) + == Some(&b';') + { + // Content-Type header ends with a semi-colon + ctx.result.add_tag("CT_EXTRA_SEMI"); + } + } + HeaderName::ContentTransferEncoding => { + let cte_ = header.value().as_text().unwrap_or_default(); + cte 
= cte_.to_ascii_lowercase(); + + if cte != cte_ { + ctx.result.add_tag("CTE_CASE"); + } + } + HeaderName::ContentDisposition => { + cd = header.value().as_content_type(); + } + HeaderName::ContentId => { + has_content_id = true; + } + _ => (), + } + } + + match ct_type.as_str() { + "multipart" => { + let part_ids = match &part.body { + PartType::Multipart(parts) => parts.as_slice(), + _ => &[], + }; + + match ct_subtype.as_str() { + "alternative" => { + let mut has_plain_part = false; + let mut has_html_part = false; + + let mut text_part_words = vec![]; + let mut text_part_uris = 0; + + let mut html_part_words = vec![]; + let mut html_part_uris = 0; + + for text_part in part_ids.iter().map(|id| &ctx.output.text_parts[*id]) { + match text_part { + TextPart::Plain { tokens, .. } if !has_plain_part => { + words_and_uris( + tokens, + &mut text_part_words, + &mut text_part_uris, + ); + has_plain_part = true; + } + TextPart::Html { tokens, .. } if !has_html_part => { + words_and_uris( + tokens, + &mut html_part_words, + &mut html_part_uris, + ); + has_html_part = true; + } + _ => (), + } + } + + // Multipart message mostly text/html MIME + if has_html_part { + if !has_plain_part { + ctx.result.add_tag("MIME_MA_MISSING_TEXT"); + } + } else if has_plain_part { + ctx.result.add_tag("MIME_MA_MISSING_HTML"); + } + + // HTML and text parts are different + if has_plain_part + && has_html_part + && (!text_part_words.is_empty() || !html_part_words.is_empty()) + && cosine_similarity(&text_part_words, &html_part_words) < 0.95 + { + ctx.result.add_tag("R_PARTS_DIFFER"); + } + + // Odd URI count between parts + if text_part_uris != html_part_uris { + ctx.result.add_tag("URI_COUNT_ODD"); + } + } + "mixed" => { + let mut num_text_parts = 0; + let mut has_other_parts = false; + + for (sub_part_id, sub_part) in part_ids + .iter() + .map(|id| (*id, &ctx.input.message.parts[*id])) + { + let ctype = sub_part + .content_type() + .map(|ct| ct.ctype()) + .unwrap_or_default(); + + if 
ctype.eq_ignore_ascii_case("text") + && !ctx.input.message.attachments.contains(&sub_part_id) + { + num_text_parts += 1; + } else if !ctype.eq_ignore_ascii_case("multipart") { + has_other_parts = true; + } + } + + // Found multipart/mixed without non-textual part + if !has_other_parts && num_text_parts < 3 { + ctx.result.add_tag("CTYPE_MIXED_BOGUS"); + } + } + "encrypted" => { + is_encrypted = true; + } + _ => (), + } + + continue; + } + "text" => { + let mut is_7bit = false; + match cte.as_str() { + "" | "7bit" => { + if raw_message + .get(part.raw_body_offset()..part.raw_end_offset()) + .map_or(false, |bytes| !bytes.is_ascii()) + { + // MIME text part claims to be ASCII but isn't + ctx.result.add_tag("R_BAD_CTE_7BIT"); + } + is_7bit = true; + } + "base64" => { + if part.contents().is_ascii() { + // Has text part encoded in base64 that does not contain any 8bit characters + ctx.result.add_tag("MIME_BASE64_TEXT_BOGUS"); + } else { + // Has text part encoded in base64 + ctx.result.add_tag("MIME_BASE64_TEXT"); + } + } + _ => (), + } + + if !is_7bit + && ct_subtype == "plain" + && ct + .and_then(|ct| ct.attribute("charset")) + .map_or(true, |c| c.is_empty()) + { + // Charset header is missing + ctx.result.add_tag("R_MISSING_CHARSET"); + } + + match &part.body { + PartType::Text(text) | PartType::Html(text) + if ctx.input.message.text_body.contains(&part_id) + || ctx.input.message.html_body.contains(&part_id) => + { + if !text.as_ref().is_single_script() { + // Text part contains multiple scripts + ctx.result.add_tag("R_MIXED_CHARSET"); + } + } + _ => (), + } + + has_text_part = true; + } + "application" => match ct_subtype.as_str() { + "pkcs7-mime" => { + ctx.result.add_tag("ENCRYPTED_SMIME"); + is_attachment = false; + is_encrypted_smime = true; + } + "pkcs7-signature" => { + ctx.result.add_tag("SIGNED_SMIME"); + is_attachment = false; + } + "pgp-encrypted" => { + ctx.result.add_tag("ENCRYPTED_PGP"); + is_attachment = false; + is_encrypted_pgp = true; + } + 
"pgp-signature" => { + ctx.result.add_tag("SIGNED_PGP"); + is_attachment = false; + } + "octet-stream" => { + if !is_encrypted + && !has_content_id + && cd.map_or(true, |cd| { + cd.attribute("type") + .unwrap_or_default() + .to_ascii_lowercase() + != "attachment" + && !cd.has_attribute("filename") + }) + { + ctx.result.add_tag("CTYPE_MISSING_DISPOSITION"); + } + } + _ => (), + }, + _ => (), + } + + num_parts += 1; + num_parts_size += part.len(); + + let ct_full = format!("{ct_type}/{ct_subtype}"); + + if is_attachment { + // Has a MIME attachment + ctx.result.add_tag("HAS_ATTACHMENT"); + match &part.body { + PartType::Binary(bytes) | PartType::InlineBinary(bytes) => { + if let Some(t) = infer::get(bytes.as_ref()) { + if t.mime_type() == ct_full { + // Known content-type + ctx.result.add_tag("MIME_GOOD"); + } else if ct_full != "application/octet-stream" { + // Known bad content-type + ctx.result.add_tag("MIME_BAD"); + } + } + } + + _ => (), + } + } + + // Analyze attachment name + if let Some(attach_name) = part.attachment_name() { + if attach_name.chars().any(|c| c.is_obscured()) { + // Attachment name contains zero-width space + ctx.result.add_tag("MIME_BAD_UNICODE"); + } + let attach_name = attach_name.trim().to_lowercase(); + if let Some((name, ext)) = attach_name.rsplit_once('.').and_then(|(name, ext)| { + Some((name, self.core.spam.list_file_extensions.get(ext)?)) + }) { + let sub_ext = name + .rsplit_once('.') + .and_then(|(_, ext)| self.core.spam.list_file_extensions.get(ext)); + + if ext.is_bad { + // Attachment has a bad extension + if sub_ext.map_or(false, |e| e.is_bad) { + ctx.result.add_tag("MIME_DOUBLE_BAD_EXTENSION"); + } else { + ctx.result.add_tag("MIME_BAD_EXTENSION"); + } + } + + if ext.is_archive && sub_ext.map_or(false, |e| e.is_archive) { + // Archive in archive + ctx.result.add_tag("MIME_ARCHIVE_IN_ARCHIVE"); + } + + if !ext.known_types.is_empty() + && ct_full != "application/octet-stream" + && !ext.known_types.contains(&ct_full) + { + // 
Invalid attachment mime type + ctx.result.add_tag("MIME_BAD_ATTACHMENT"); + } + } + } + } + + match num_parts_size { + 0 => { + // Message contains no parts + ctx.result.add_tag("COMPLETELY_EMPTY"); + } + 1..64 if num_parts == 1 => { + // Message contains only one short part + ctx.result.add_tag("SINGLE_SHORT_PART"); + } + _ => (), + } + + if has_text_part && (is_encrypted_pgp || is_encrypted_smime) { + // Message contains both text and encrypted parts + ctx.result.add_tag("BOGUS_ENCRYPTED_AND_TEXT"); + } + } +} + +fn words_and_uris<'x, T: AsRef>( + tokens: &'x [TokenType], + words: &mut Vec<&'x str>, + uri_count: &mut usize, +) { + let mut uris = HashSet::new(); + + for token in tokens { + match token { + TokenType::Alphabetic(v) | TokenType::Alphanumeric(v) => { + words.push(v.as_ref()); + } + TokenType::Url(v) => { + if let Some(host) = v + .as_ref() + .parse::() + .ok() + .and_then(|uri| uri.host().map(Hostname::new)) + { + uris.insert(host.sld.unwrap_or(host.fqdn)); + } + } + _ => (), + } + } + + *uri_count = uris.len(); +} diff --git a/crates/spam-filter/src/analysis/mod.rs b/crates/spam-filter/src/analysis/mod.rs index ddb395b6..7146062d 100644 --- a/crates/spam-filter/src/analysis/mod.rs +++ b/crates/spam-filter/src/analysis/mod.rs @@ -19,9 +19,12 @@ pub mod domain; pub mod ehlo; pub mod from; pub mod headers; +pub mod html; pub mod init; pub mod ip; pub mod messageid; +pub mod mime; +pub mod pyzor; pub mod received; pub mod recipient; pub mod replyto; diff --git a/crates/spam-filter/src/analysis/pyzor.rs b/crates/spam-filter/src/analysis/pyzor.rs new file mode 100644 index 00000000..1fec2b52 --- /dev/null +++ b/crates/spam-filter/src/analysis/pyzor.rs @@ -0,0 +1,35 @@ +use std::future::Future; + +use common::Server; + +use crate::{modules::pyzor::pyzor_check, SpamFilterContext}; + +pub trait SpamFilterAnalyzePyzor: Sync + Send { + fn spam_filter_analyze_pyzor( + &self, + ctx: &mut SpamFilterContext<'_>, + ) -> impl Future + Send; +} + +impl 
SpamFilterAnalyzePyzor for Server { + async fn spam_filter_analyze_pyzor(&self, ctx: &mut SpamFilterContext<'_>) { + if let Some(config) = &self.core.spam.pyzor { + match pyzor_check(ctx.input.message, config).await { + Ok(Some(result)) => { + if result.code == 200 + && result.count > config.min_count + && (result.wl_count < config.min_wl_count + || (result.wl_count as f64 / result.count as f64) < config.ratio) + { + ctx.result.add_tag("PYZOR"); + } + // TODO: log time + } + Ok(None) => {} + Err(err) => { + trc::error!(err.span_id(ctx.input.span_id)); + } + } + } + } +} diff --git a/crates/spam-filter/src/analysis/url.rs b/crates/spam-filter/src/analysis/url.rs index 94b73dd4..65ac8df0 100644 --- a/crates/spam-filter/src/analysis/url.rs +++ b/crates/spam-filter/src/analysis/url.rs @@ -438,7 +438,7 @@ fn is_single_html_url>(html_tokens: &[HtmlToken], tokens: &[TokenT url_count = 0; for token in html_tokens { - if matches!(token, HtmlToken::StartTag { name, attributes } if *name == A && attributes.iter().any(|(k, _)| *k == HREF)) + if matches!(token, HtmlToken::StartTag { name, attributes, .. 
} if *name == A && attributes.iter().any(|(k, _)| *k == HREF)) { url_count += 1; } diff --git a/crates/spam-filter/src/modules/html.rs b/crates/spam-filter/src/modules/html.rs index 3d5eeb72..ae181abc 100644 --- a/crates/spam-filter/src/modules/html.rs +++ b/crates/spam-filter/src/modules/html.rs @@ -5,6 +5,7 @@ pub enum HtmlToken { StartTag { name: u64, attributes: Vec<(u64, Option)>, + is_self_closing: bool, }, EndTag { name: u64, @@ -18,10 +19,46 @@ pub enum HtmlToken { } pub(crate) const A: u64 = b'a' as u64; +pub(crate) const IMG: u64 = (b'i' as u64) | (b'm' as u64) << 8 | (b'g' as u64) << 16; +pub(crate) const HEAD: u64 = + (b'h' as u64) | (b'e' as u64) << 8 | (b'a' as u64) << 16 | (b'd' as u64) << 24; +pub(crate) const BODY: u64 = + (b'b' as u64) | (b'o' as u64) << 8 | (b'd' as u64) << 16 | (b'y' as u64) << 24; +pub(crate) const META: u64 = + (b'm' as u64) | (b'e' as u64) << 8 | (b't' as u64) << 16 | (b'a' as u64) << 24; +pub(crate) const LINK: u64 = + (b'l' as u64) | (b'i' as u64) << 8 | (b'n' as u64) << 16 | (b'k' as u64) << 24; pub(crate) const HREF: u64 = (b'h' as u64) | (b'r' as u64) << 8 | (b'e' as u64) << 16 | (b'f' as u64) << 24; pub(crate) const SRC: u64 = (b's' as u64) | (b'r' as u64) << 8 | (b'c' as u64) << 16; +pub(crate) const WIDTH: u64 = (b'w' as u64) + | (b'i' as u64) << 8 + | (b'd' as u64) << 16 + | (b't' as u64) << 24 + | (b'h' as u64) << 32; +pub(crate) const HEIGHT: u64 = (b'h' as u64) + | (b'e' as u64) << 8 + | (b'i' as u64) << 16 + | (b'g' as u64) << 24 + | (b'h' as u64) << 32 + | (b't' as u64) << 40; +pub(crate) const REL: u64 = (b'r' as u64) | (b'e' as u64) << 8 | (b'l' as u64) << 16; +pub(crate) const CONTENT: u64 = (b'c' as u64) + | (b'o' as u64) << 8 + | (b'n' as u64) << 16 + | (b't' as u64) << 24 + | (b'e' as u64) << 32 + | (b'n' as u64) << 40 + | (b't' as u64) << 48; +pub(crate) const HTTP_EQUIV: u64 = (b'h' as u64) + | (b't' as u64) << 8 + | (b't' as u64) << 16 + | (b'p' as u64) << 24 + | (b'-' as u64) << 32 + | (b'e' as u64) << 
40 + | (b'q' as u64) << 48 + | (b'u' as u64) << 56; pub fn html_to_tokens(input: &str) -> Vec { let input = input.as_bytes(); @@ -106,6 +143,7 @@ pub fn html_to_tokens(input: &str) -> Vec { } let mut in_quote = false; + let mut is_self_closing = false; let mut key: u64 = 0; let mut shift = 0; @@ -123,6 +161,9 @@ pub fn html_to_tokens(input: &str) -> Vec { key |= ((ch - b'A' + b'a') as u64) << shift; shift += 8; } + b'/' if !in_quote => { + is_self_closing = true; + } b'>' if !in_quote => { if shift != 0 { if tag == 0 { @@ -205,6 +246,7 @@ pub fn html_to_tokens(input: &str) -> Vec { tags.push(HtmlToken::StartTag { name: tag, attributes, + is_self_closing, }); } } @@ -292,7 +334,8 @@ mod tests { tokens, vec![HtmlToken::StartTag { name: 7760228, - attributes: vec![] + attributes: vec![], + is_self_closing: false }] ); } @@ -325,14 +368,16 @@ mod tests { vec![ HtmlToken::StartTag { name: 7760228, - attributes: vec![] + attributes: vec![], + is_self_closing: false }, HtmlToken::Text { text: "Hello,".to_string() }, HtmlToken::StartTag { name: 1851879539, - attributes: vec![] + attributes: vec![], + is_self_closing: false }, HtmlToken::Text { text: " \" world \"".to_string() @@ -358,15 +403,18 @@ mod tests { attributes: vec![ (1701869940, Some("text".to_string())), (435761734006, Some("test".to_string())) - ] + ], + is_self_closing: false }, HtmlToken::StartTag { name: 111516266162547, - attributes: vec![] + attributes: vec![], + is_self_closing: true }, HtmlToken::StartTag { name: 6647407, - attributes: vec![(1920234593, None)] + attributes: vec![(1920234593, None)], + is_self_closing: true }, HtmlToken::StartTag { name: 97, @@ -374,7 +422,8 @@ mod tests { (98, Some("1".to_string())), (98, None), (99, Some("123".to_string())) - ] + ], + is_self_closing: false } ] ); diff --git a/crates/spam-filter/src/modules/mod.rs b/crates/spam-filter/src/modules/mod.rs index f75ebf55..707f618b 100644 --- a/crates/spam-filter/src/modules/mod.rs +++ 
b/crates/spam-filter/src/modules/mod.rs @@ -1,4 +1,5 @@ pub mod dnsbl; pub mod html; +pub mod pyzor; pub mod remote_list; pub mod sanitize; diff --git a/crates/spam-filter/src/modules/pyzor.rs b/crates/spam-filter/src/modules/pyzor.rs index 6523f62b..2e26be0c 100644 --- a/crates/spam-filter/src/modules/pyzor.rs +++ b/crates/spam-filter/src/modules/pyzor.rs @@ -4,16 +4,14 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use sieve::{runtime::Variable, FunctionMap}; - -use super::PluginContext; - use std::{ borrow::Cow, io::Write, + net::SocketAddr, time::{Duration, SystemTime}, }; +use common::config::spamfilter::PyzorConfig; use mail_parser::{decoders::html::add_html_token, Message, PartType}; use nlp::tokenizers::types::{TokenType, TypesTokenizer}; use sha1::{Digest, Sha1}; @@ -24,29 +22,27 @@ const ATOMIC_NUM_LINES: usize = 4; const DIGEST_SPEC: &[(usize, usize)] = &[(20, 3), (60, 3)]; #[derive(Default, Debug, PartialEq, Eq)] -struct PyzorResponse { - code: u32, - count: u64, - wl_count: u64, +pub(crate) struct PyzorResponse { + pub code: u32, + pub count: u64, + pub wl_count: u64, } -pub fn register(plugin_id: u32, fnc_map: &mut FunctionMap) { - fnc_map.set_external_function("pyzor_check", plugin_id, 2); -} - -pub async fn exec(ctx: PluginContext<'_>) -> trc::Result { +pub(crate) async fn pyzor_check( + message: &Message<'_>, + config: &PyzorConfig, +) -> trc::Result> { // Make sure there is at least one text part - if !ctx - .message + if !message .parts .iter() .any(|p| matches!(p.body, PartType::Text(_) | PartType::Html(_))) { - return Ok(Variable::default()); + return Ok(None); } // Hash message - let request = ctx.message.pyzor_check_message(); + let request = message.pyzor_check_message(); #[cfg(feature = "test_mode")] { @@ -74,35 +70,21 @@ pub async fn exec(ctx: PluginContext<'_>) -> trc::Result { } } - let address = ctx.arguments[0].to_string(); - let timeout = Duration::from_secs((ctx.arguments[1].to_integer() as u64).clamp(5, 60)); - // 
Send message to address - pyzor_send_message(address.as_ref(), timeout, &request) + pyzor_send_message(config.address, config.timeout, &request) .await .map(Into::into) .map_err(|err| { trc::SpamEvent::PyzorError .into_err() - .ctx(trc::Key::Url, address.to_string()) + .ctx(trc::Key::Url, config.address.to_string()) .reason(err) .details("Pyzor failed") }) } -impl From for Variable { - fn from(response: PyzorResponse) -> Self { - vec![ - Variable::from(response.code), - Variable::from(response.count), - Variable::from(response.wl_count), - ] - .into() - } -} - async fn pyzor_send_message( - addr: &str, + addr: SocketAddr, timeout: Duration, message: &str, ) -> std::io::Result { @@ -451,7 +433,7 @@ mod test { async fn send_message() { assert_eq!( pyzor_send_message( - "public.pyzor.org:24441", + "public.pyzor.org:24441".parse().unwrap(), Duration::from_secs(10), concat!( "Op: check\n", diff --git a/crates/trc/src/event/description.rs b/crates/trc/src/event/description.rs index 6a2e83ad..fb2eb07f 100644 --- a/crates/trc/src/event/description.rs +++ b/crates/trc/src/event/description.rs @@ -442,6 +442,7 @@ impl SmtpEvent { SmtpEvent::RcptToDuplicate => "Duplicate RCPT TO", SmtpEvent::RcptToRewritten => "RCPT TO address rewritten", SmtpEvent::RcptToMissing => "RCPT TO address missing", + SmtpEvent::RcptToGreylisted => "RCPT TO greylisted", SmtpEvent::TooManyRecipients => "Too many recipients", SmtpEvent::TooManyInvalidRcpt => "Too many invalid recipients", SmtpEvent::RawInput => "Raw SMTP input received", @@ -552,6 +553,7 @@ impl SmtpEvent { } SmtpEvent::RcptToRewritten => "The envelope recipient address was rewritten", SmtpEvent::RcptToMissing => "The remote client issued a DATA command before RCPT TO", + SmtpEvent::RcptToGreylisted => "The recipient was greylisted", SmtpEvent::TooManyRecipients => { "The remote client exceeded the number of recipients allowed" } diff --git a/crates/trc/src/event/level.rs b/crates/trc/src/event/level.rs index 927d2501..8a900818 100644 
--- a/crates/trc/src/event/level.rs +++ b/crates/trc/src/event/level.rs @@ -186,6 +186,7 @@ impl EventType { | SmtpEvent::MailboxDoesNotExist | SmtpEvent::RelayNotAllowed | SmtpEvent::RcptTo + | SmtpEvent::RcptToGreylisted | SmtpEvent::TooManyInvalidRcpt | SmtpEvent::Vrfy | SmtpEvent::VrfyNotFound diff --git a/crates/trc/src/lib.rs b/crates/trc/src/lib.rs index 19ffb5db..c7d9eb22 100644 --- a/crates/trc/src/lib.rs +++ b/crates/trc/src/lib.rs @@ -393,6 +393,7 @@ pub enum SmtpEvent { RcptToDuplicate, RcptToRewritten, RcptToMissing, + RcptToGreylisted, TooManyRecipients, TooManyInvalidRcpt, RawInput, diff --git a/crates/trc/src/serializers/binary.rs b/crates/trc/src/serializers/binary.rs index 26c82e75..2ddd8e4a 100644 --- a/crates/trc/src/serializers/binary.rs +++ b/crates/trc/src/serializers/binary.rs @@ -866,6 +866,7 @@ impl EventType { EventType::Security(SecurityEvent::ScanBan) => 558, EventType::Store(StoreEvent::AzureError) => 559, EventType::TlsRpt(TlsRptEvent::RecordNotFound) => 560, + EventType::Smtp(SmtpEvent::RcptToGreylisted) => 561, } } @@ -1472,6 +1473,7 @@ impl EventType { 558 => Some(EventType::Security(SecurityEvent::ScanBan)), 559 => Some(EventType::Store(StoreEvent::AzureError)), 560 => Some(EventType::TlsRpt(TlsRptEvent::RecordNotFound)), + 561 => Some(EventType::Smtp(SmtpEvent::RcptToGreylisted)), _ => None, } } diff --git a/resources/config/spamfilter/scripts/greylist.sieve b/resources/config/spamfilter/scripts/greylist.sieve deleted file mode 100644 index 17d86c5b..00000000 --- a/resources/config/spamfilter/scripts/greylist.sieve +++ /dev/null @@ -1,9 +0,0 @@ - -set "triplet" "g:${env.remote_ip}.${envelope.from}.${envelope.to}"; - -if eval "!key_exists(SPAM_DB, triplet)" { - # Greylist sender for 30 days - eval "key_set(SPAM_DB, triplet, '', 2592000)"; - reject "422 4.2.2 Greylisted, please try again in a few moments."; - stop; -} diff --git a/resources/config/spamfilter/scripts/html.sieve b/resources/config/spamfilter/scripts/html.sieve 
deleted file mode 100644 index f64320bf..00000000 --- a/resources/config/spamfilter/scripts/html.sieve +++ /dev/null @@ -1,148 +0,0 @@ - -# Message only has text/html MIME parts -if eval "header.content-type == 'text/html'" { - let "t.MIME_HTML_ONLY" "1"; -} - -foreverypart { - if eval "eq_ignore_case(header.content-type, 'text/html')" { - # Tokenize HTML - let "is_body_part" "is_body()"; - let "html_tokens" "tokenize(part.text, 'html')"; - let "html_tokens_len" "len(html_tokens)"; - let "html_char_count" "0"; - let "html_space_count" "0"; - let "html_img_words" "0"; - let "html_words" "0"; - let "has_link_to_img" "0"; - let "has_uri" "0"; - let "has_text" "0"; - let "in_head" "0"; - let "in_body" "0"; - let "in_anchor" "0"; - let "in_anchor_href_ip" "0"; - let "in_anchor_href" ""; - - let "i" "0"; - while "i < html_tokens_len" { - let "token" "html_tokens[i]"; - let "i" "i + 1"; - - # Tokens starting with '_' are text nodes - if eval "starts_with(token, '_')" { - if eval "in_head == 0" { - let "html_char_count" "html_char_count + count_chars(token)"; - let "html_space_count" "html_space_count + count_spaces(token)"; - - let "text" "to_lowercase(trim(strip_prefix(token, '_')))"; - let "html_words" "html_words + len(tokenize(text, 'words'))"; - - let "uris" "tokenize(text, 'uri')"; - - if eval "!is_empty(uris)" { - let "has_uri" "1"; - let "uri" "uris[0]"; - - if eval "in_anchor && !is_empty(in_anchor_href)" { - if eval "contains(text, '://') && - uri_part(uri, 'scheme') != uri_part(in_anchor_href, 'scheme')" { - # The anchor text contains a distinct scheme compared to the target URL - let "t.HTTP_TO_HTTPS" "1"; - } - if eval "(!in_anchor_href_ip && (domain_part(uri_part(uri, 'host'), 'sld') != domain_part(uri_part(in_anchor_href, 'host'), 'sld'))) || - (in_anchor_href_ip && (uri_part(uri, 'host') != uri_part(in_anchor_href, 'host')))" { - let "t.PHISHING" "1"; - } - } - } elsif eval "!is_empty(text)" { - let "has_text" "1"; - } - } - } elsif eval 
"starts_with(token, '= 210" { - let "has_link_to_img" "1"; - } - if eval "dimensions > 100" { - # We assume that a single picture 100x200 contains approx 3 words of text - let "html_img_words" "html_img_words + dimensions / 100"; - } - - let "img_src" "html_attr(token, 'src')"; - if eval "starts_with(img_src, 'data:') && contains(img_src, ';base64,')" { - # Has Data URI encoding - let "t.HAS_DATA_URI" "1"; - } - } - } elsif eval "starts_with(token, '= 2048) && - (html_img_words / (html_words + html_img_words) > 0.5)" { - # Message contains more images than text - let "t.HTML_TEXT_IMG_RATIO" "1"; - } - - if eval "has_uri && !has_text" { - let "t.BODY_URI_ONLY" "1"; - } - } - } -} - diff --git a/resources/config/spamfilter/scripts/mime.sieve b/resources/config/spamfilter/scripts/mime.sieve deleted file mode 100644 index cb6fdbec..00000000 --- a/resources/config/spamfilter/scripts/mime.sieve +++ /dev/null @@ -1,232 +0,0 @@ -if eval "!header.mime-version.exists" { - if eval "header.content-type.exists || header.content-transfer-encoding.exists" { - let "t.MISSING_MIME_VERSION" "1"; - } -} elsif eval "header.mime-version.raw_name != 'MIME-Version'" { - let "t.MV_CASE" "1"; -} - -let "has_text_part" "0"; -let "is_encrypted" "0"; -let "parts_num" "0"; -let "parts_max_len" "0"; - -if eval "header.Content-Type.exists && !header.Content-Disposition:Content-Transfer-Encoding:MIME-Version.exists && !eq_ignore_case(header.Content-Type, 'text/plain')" { - # Only Content-Type header without other MIME headers - let "t.MIME_HEADER_CTYPE_ONLY" "1"; -} - -foreverypart { - let "content_type" "to_lowercase(header.content-type)"; - let "type" "to_lowercase(header.content-type.type)"; - let "subtype" "to_lowercase(header.content-type.subtype)"; - let "cte" "header.content-transfer-encoding"; - let "part_is_attachment" "is_attachment()"; - - if eval "cte != '' && !is_lowercase(cte)" { - let "cte" "to_lowercase(cte)"; - let "t.CTE_CASE" "1"; - } - - if eval 
"ends_with(header.content-type.raw, ';')" { - # Content-Type header ends with a semi-colon - let "t.CT_EXTRA_SEMI" "1"; - } - - if eval "type == 'multipart'" { - if eval "subtype == 'alternative'" { - let "has_plain_part" "0"; - let "has_html_part" "0"; - - let "text_part_words" ""; - let "text_part_uris" "0"; - - let "html_part_words" ""; - let "html_part_uris" "0"; - - foreverypart { - let "ma_ct" "to_lowercase(header.content-type)"; - - if eval "!has_plain_part && ma_ct == 'text/plain'" { - let "text_part" "part.text"; - let "text_part_words" "tokenize(text_part, 'words')"; - let "text_part_uris" "count(dedup(uri_part(tokenize(text_part, 'uri_strict'), 'host')))"; - let "has_plain_part" "1"; - } elsif eval "!has_html_part && ma_ct == 'text/html'" { - let "html_part" "html_to_text(part.text)"; - let "html_part_words" "tokenize(html_part, 'words')"; - let "html_part_uris" "count(dedup(uri_part(tokenize(part.text, 'uri_strict'), 'host')))"; - let "has_html_part" "1"; - } - } - - # Multipart message mostly text/html MIME - if eval "has_html_part" { - if eval "!has_plain_part" { - let "t.MIME_MA_MISSING_TEXT" "1"; - } - } elsif eval "has_plain_part" { - let "t.MIME_MA_MISSING_HTML" "1"; - } - - # HTML and text parts are different - if eval "!t.R_PARTS_DIFFER && has_html_part && has_plain_part && - (!is_empty(text_part_words) || !is_empty(html_part_words)) && - cosine_similarity(text_part_words, html_part_words) < 0.95" { - let "t.R_PARTS_DIFFER" "1"; - } - - # Odd URI count between parts - if eval "text_part_uris != html_part_uris" { - set "t.URI_COUNT_ODD" "1"; - } - } elsif eval "subtype == 'mixed'" { - let "num_text_parts" "0"; - let "has_other_part" "0"; - - foreverypart { - if eval "eq_ignore_case(header.content-type.type, 'text') && !is_attachment()" { - let "num_text_parts" "num_text_parts + 1"; - } elsif eval "!eq_ignore_case(header.content-type.type, 'multipart')" { - let "has_other_part" "1"; - } - } - - # Found multipart/mixed without non-textual part - if 
eval "!has_other_part && num_text_parts < 3" { - let "t.CTYPE_MIXED_BOGUS" "1"; - } - } elsif eval "subtype == 'encrypted'" { - set "is_encrypted" "1"; - } - } else { - if eval "type == 'text'" { - # MIME text part claims to be ASCII but isn't - if eval "cte == '' || cte == '7bit'" { - if eval "!is_ascii(part.raw)" { - let "t.R_BAD_CTE_7BIT" "1"; - } - } else { - if eval "cte == 'base64'" { - if eval "is_ascii(part.text)" { - # Has text part encoded in base64 that does not contain any 8bit characters - let "t.MIME_BASE64_TEXT_BOGUS" "1"; - } else { - # Has text part encoded in base64 - let "t.MIME_BASE64_TEXT" "1"; - } - } - - if eval "subtype == 'plain' && is_empty(header.content-type.attr.charset)" { - # Charset header is missing - let "t.R_MISSING_CHARSET" "1"; - } - } - let "has_text_part" "1"; - } elsif eval "type == 'application'" { - if eval "subtype == 'pkcs7-mime'" { - let "t.ENCRYPTED_SMIME" "1"; - let "part_is_attachment" "0"; - } elsif eval "subtype == 'pkcs7-signature'" { - let "t.SIGNED_SMIME" "1"; - let "part_is_attachment" "0"; - } elsif eval "subtype == 'pgp-encrypted'" { - let "t.ENCRYPTED_PGP" "1"; - let "part_is_attachment" "0"; - } elsif eval "subtype == 'pgp-signature'" { - let "t.SIGNED_PGP" "1"; - let "part_is_attachment" "0"; - } elsif eval "subtype == 'octet-stream'" { - if eval "!is_encrypted && - !header.content-id.exists && - (!header.content-disposition.exists || - (!eq_ignore_case(header.content-disposition.type, 'attachment') && - is_empty(header.content-disposition.attr.filename)))" { - let "t.CTYPE_MISSING_DISPOSITION" "1"; - } - } - } - - # Increase part count - let "parts_num" "parts_num + 1"; - if eval "parts_num == 1" { - let "parts_len" "mime_part_len()"; - if eval "parts_len > parts_max_len" { - let "parts_max_len" "parts_len"; - } - } - } - - if eval "is_empty(type) && header.content-type.exists" { - let "t.BROKEN_CONTENT_TYPE" "1"; - } - - if eval "part_is_attachment" { - # Has a MIME attachment - let "t.HAS_ATTACHMENT" 
"1"; - - # Detect and compare mime type - let "detected_mime_type" "detect_file_type('mime')"; - if eval "!is_empty(detected_mime_type)" { - if eval "detected_mime_type == content_type" { - # Known content-type - let "t.MIME_GOOD" "1"; - } elsif eval "content_type != 'application/octet-stream'" { - # Known bad content-type - let "t.MIME_BAD" "1"; - } - } - } - - # Analyze attachment name - let "attach_name" "attachment_name()"; - if eval "!is_empty(attach_name)" { - if eval "has_obscured(attach_name)" { - let "t.MIME_BAD_UNICODE" "1"; - } - let "name_parts" "rsplit(to_lowercase(attach_name), '.')"; - if eval "count(name_parts) > 1" { - let "ext_type" "key_get('spam-mime', name_parts[0])"; - if eval "!is_empty(ext_type)" { - let "ext_type_double" "key_get('spam-mime', name_parts[1])"; - if eval "contains(ext_type, 'BAD')" { - # Bad extension - if eval "contains(ext_type_double, 'BAD')" { - let "t.MIME_DOUBLE_BAD_EXTENSION" "1"; - } else { - let "t.MIME_BAD_EXTENSION" "1"; - } - } - if eval "contains(ext_type, 'AR') && contains(ext_type_double, 'AR')" { - # Archive in archive - let "t.MIME_ARCHIVE_IN_ARCHIVE" "1"; - } - - if eval "contains(ext_type, '/') && - content_type != 'application/octet-stream' && - !contains(split(ext_type, '|'), content_type)" { - # Invalid attachment mime type - let "t.MIME_BAD_ATTACHMENT" "1"; - } - } - } - } - -} - -# Message contains both text and encrypted parts -if eval "has_text_part && (t.ENCRYPTED_SMIME || t.ENCRYPTED_PGP)" { - let "t.BOGUS_ENCRYPTED_AND_TEXT" "1"; -} - -# Message contains only one short part -if eval "parts_num == 1 && parts_max_len < 64" { - let "t.SINGLE_SHORT_PART" "1"; -} elsif eval "parts_max_len == 0" { - let "t.COMPLETELY_EMPTY" "1"; -} - -# Check for mixed script in body -if eval "!is_single_script(text_body)" { - let "t.R_MIXED_CHARSET" "1"; -} diff --git a/resources/config/spamfilter/scripts/pyzor.sieve b/resources/config/spamfilter/scripts/pyzor.sieve deleted file mode 100644 index 875451d0..00000000 --- 
a/resources/config/spamfilter/scripts/pyzor.sieve +++ /dev/null @@ -1,11 +0,0 @@ -# Check message hash against Pyzor on public.pyzor.org:24441 using a 5 second timeout -let "pyzor_response" "pyzor_check('public.pyzor.org:24441', 5)"; - -if eval "!is_empty(pyzor_response) && pyzor_response[0] == 200" { - let "count" "pyzor_response[1]"; - let "wl_count" "pyzor_response[2]"; - - if eval "count > 5 && (wl_count < 10 || wl_count / count < 0.2)" { - let "t.PYZOR" "1"; - } -} diff --git a/resources/config/spamfilter/scripts/train.sieve b/resources/config/spamfilter/scripts/train.sieve deleted file mode 100644 index 8cae688e..00000000 --- a/resources/config/spamfilter/scripts/train.sieve +++ /dev/null @@ -1,12 +0,0 @@ - - -# Obtain thread name and subject -let "contents" "thread_name(header.subject) + ' ' + body.to_text"; - -if eval "env.train == 'spam'" { - eval "bayes_train(SPAM_DB, contents, true)"; -} elsif eval "env.train == 'ham'" { - eval "bayes_train(SPAM_DB, contents, false)"; -} else { - reject "Missing variable 'train'"; -}