Port Spam filter to Rust - part 5

This commit is contained in:
mdecimus 2024-12-11 17:57:34 +01:00
parent 44ae796d9b
commit b5696c2d26
23 changed files with 1017 additions and 459 deletions

3
Cargo.lock generated
View file

@ -6450,6 +6450,7 @@ dependencies = [
"decancer",
"hyper 1.5.1",
"idna 1.0.3",
"infer 0.16.0",
"mail-auth",
"mail-builder",
"mail-parser",
@ -6457,6 +6458,8 @@ dependencies = [
"nlp",
"psl",
"reqwest 0.12.9",
"sha1",
"sha2 0.10.8",
"smtp-proto",
"store",
"tokio",

View file

@ -4,11 +4,15 @@
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/
use std::time::Duration;
use std::{net::SocketAddr, time::Duration};
use ahash::AHashSet;
use hyper::HeaderMap;
use mail_parser::HeaderName;
use utils::{config::Config, glob::GlobSet};
use utils::{
config::Config,
glob::{GlobMap, GlobSet},
};
use super::if_block::IfBlock;
@ -19,16 +23,60 @@ pub struct SpamFilterConfig {
pub max_rbl_email_checks: usize,
pub max_rbl_url_checks: usize,
pub greylist_duration: Option<Duration>,
pub pyzor: Option<PyzorConfig>,
pub asn: AsnLookupProvider,
pub list_dmarc_allow: GlobSet,
pub list_spf_dkim_allow: GlobSet,
pub list_freemail_providers: GlobSet,
pub list_disposable_providers: GlobSet,
pub list_trusted_domains: GlobSet,
pub list_url_redirectors: GlobSet,
pub list_file_extensions: GlobMap<FileExtension>,
pub remote_lists: Vec<RemoteListConfig>,
pub dnsbls: Vec<DnsblConfig>,
}
/// Provider used to resolve the autonomous system number (and optionally the
/// country) associated with an IP address.
#[derive(Debug, Clone, Default)]
pub enum AsnLookupProvider {
    /// DNS-based lookup against a per-address-family zone.
    Dns {
        // Zone queried for IPv4 addresses.
        ipv4_zone: String,
        // Zone queried for IPv6 addresses.
        ipv6_zone: String,
        // Separator between the fields of the DNS reply.
        separator: char,
        // Index of the ASN field within the separated reply.
        asn_index: usize,
        // Index of the country field, when the zone provides one.
        country_index: Option<usize>,
    },
    /// HTTP REST API lookup.
    Rest {
        // Endpoint URL of the lookup API.
        api: String,
        // Request timeout.
        timeout: Duration,
        // Extra headers sent with each request (e.g. authentication).
        headers: HeaderMap,
        // Path to the ASN field in the response body
        // (presumably a JSON pointer — confirm against the lookup code).
        asn_path: Vec<String>,
        // Path to the country field, when available.
        country_path: Option<Vec<String>>,
    },
    /// ASN lookups disabled.
    #[default]
    None,
}
/// Connection settings and spam thresholds for the Pyzor digest service.
#[derive(Debug, Clone)]
pub struct PyzorConfig {
    // Address of the Pyzor server.
    pub address: SocketAddr,
    // Network timeout for a single check.
    pub timeout: Duration,
    // A digest is only considered spam when its report count exceeds this value.
    pub min_count: u64,
    // Whitelist reports below this count do not clear a digest.
    pub min_wl_count: u64,
    // Maximum whitelist/report ratio for a digest to still count as spam.
    pub ratio: f64,
}
/// Properties of an attachment file extension from the spam-filter extension list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileExtension {
    // Content types this extension is expected to be delivered with.
    pub known_types: AHashSet<String>,
    // Extension is flagged as dangerous by the list (used for MIME_BAD_EXTENSION).
    pub is_bad: bool,
    // Extension denotes an archive format (used for MIME_ARCHIVE_IN_ARCHIVE).
    pub is_archive: bool,
    // NOTE(review): the meaning of `is_nz` is not evident from this file — confirm
    // against the extension-list loader before documenting further.
    pub is_nz: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Element {
Url,

View file

@ -82,6 +82,32 @@ pub fn fn_cosine_similarity<'x>(_: &'x Context<'x>, v: Vec<Variable>) -> Variabl
.into()
}
/// Computes the cosine similarity between two bags of words.
///
/// Each slice is treated as a multiset of tokens; the result is the cosine of
/// the angle between the two term-frequency vectors, in `[0.0, 1.0]`.
/// Returns `0.0` when either input is empty (zero magnitude).
pub fn cosine_similarity(a: &[&str], b: &[&str]) -> f64 {
    // Term frequency per word: [occurrences in `a`, occurrences in `b`].
    let mut word_freq: HashMap<&str, [u32; 2]> = HashMap::new();

    for (idx, items) in [a, b].into_iter().enumerate() {
        for item in items {
            word_freq.entry(item).or_insert([0, 0])[idx] += 1;
        }
    }

    // Accumulate in u64: squaring a u32 count (or summing many products) can
    // overflow u32 for large inputs.
    let mut dot_product: u64 = 0;
    let mut magnitude_a: u64 = 0;
    let mut magnitude_b: u64 = 0;

    for count in word_freq.values() {
        let (ca, cb) = (u64::from(count[0]), u64::from(count[1]));
        dot_product += ca * cb;
        magnitude_a += ca * ca;
        magnitude_b += cb * cb;
    }

    if magnitude_a != 0 && magnitude_b != 0 {
        dot_product as f64 / (magnitude_a as f64).sqrt() / (magnitude_b as f64).sqrt()
    } else {
        0.0
    }
}
pub fn fn_jaccard_similarity<'x>(_: &'x Context<'x>, v: Vec<Variable>) -> Variable {
let mut word_freq = [HashSet::new(), HashSet::new()];

View file

@ -291,6 +291,70 @@ impl<T: SessionStream> Session<T> {
}
if self.is_allowed().await {
// Greylist
if let Some(greylist_duration) = self
.server
.core
.spam
.greylist_duration
.filter(|_| self.data.authenticated_as.is_none())
{
let key = format!(
"g:{}:{}:{}",
self.data.remote_ip_str,
self.data.mail_from.as_ref().unwrap().address_lcase,
self.data.rcpt_to.last().unwrap().address_lcase
);
match self
.server
.lookup_store()
.key_exists(key.clone().into_bytes())
.await
{
Ok(true) => (),
Ok(false) => {
match self
.server
.lookup_store()
.key_set(key.into_bytes(), vec![], greylist_duration.as_secs().into())
.await
{
Ok(_) => {
let rcpt = self.data.rcpt_to.pop().unwrap();
trc::event!(
Smtp(SmtpEvent::RcptToGreylisted),
SpanId = self.data.session_id,
To = rcpt.address_lcase,
);
return self
.write(
concat!(
"422 4.2.2 Greylisted, please try ",
"again in a few moments.\r\n"
)
.as_bytes(),
)
.await;
}
Err(err) => {
trc::error!(err
.span_id(self.data.session_id)
.caused_by(trc::location!())
.details("Failed to set greylist."));
}
}
}
Err(err) => {
trc::error!(err
.span_id(self.data.session_id)
.caused_by(trc::location!())
.details("Failed to check greylist."));
}
}
}
trc::event!(
Smtp(SmtpEvent::RcptTo),
SpanId = self.data.session_id,

View file

@ -15,12 +15,16 @@ mail-parser = { version = "0.9", features = ["full_encoding", "ludicrous_mode"]
mail-builder = { version = "0.3", features = ["ludicrous_mode"] }
mail-auth = { version = "0.5" }
mail-send = { version = "0.4", default-features = false, features = ["cram-md5", "ring", "tls12"] }
tokio = { version = "1.23", features = ["net", "macros"] }
psl = "2"
hyper = { version = "1.0.1", features = ["server", "http1", "http2"] }
idna = "1.0"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls-webpki-roots", "http2", "stream"]}
decancer = "3.0.1"
unicode-security = "0.1.0"
infer = "0.16"
sha1 = "0.10"
sha2 = "0.10.6"
[features]
test_mode = []

View file

@ -0,0 +1,323 @@
use std::future::Future;
use common::Server;
use hyper::Uri;
use mail_parser::MimeHeaders;
use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use crate::{modules::html::*, Hostname, SpamFilterContext, TextPart};
/// Extension trait that performs HTML-specific spam analysis on a message.
pub trait SpamFilterAnalyzeHtml: Sync + Send {
    /// Scans the HTML parts of the message in `ctx` and records the matching
    /// spam tags in `ctx.result`.
    fn spam_filter_analyze_html(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
/// Target of the most recently opened `<a>` tag, tracked while walking the
/// HTML token stream.
struct Href {
    // Parsed form of the `href` attribute, when it is a valid URI.
    url_parsed: Option<Uri>,
    // Hostname extracted from the URI, when present.
    host: Option<Hostname>,
}
impl SpamFilterAnalyzeHtml for Server {
    /// Walks the HTML token stream of every HTML text part and records
    /// HTML-related spam tags (data URIs, phishing anchors, image/text
    /// ratios, unbalanced tags, etc.) in `ctx.result`.
    async fn spam_filter_analyze_html(&self, ctx: &mut SpamFilterContext<'_>) {
        // Message only has text/html MIME parts
        if ctx.input.message.content_type().map_or(false, |ct| {
            ct.ctype().eq_ignore_ascii_case("text")
                && ct
                    .subtype()
                    .unwrap_or_default()
                    .eq_ignore_ascii_case("html")
        }) {
            ctx.result.add_tag("MIME_HTML_ONLY");
        }

        // Target of the currently open <a> tag, if any.
        let mut last_href: Option<Href> = None;
        let mut html_img_words = 0;
        let mut html_text_chars = 0;
        // Nesting depth of <head>/<body>; non-zero at the end of a body part
        // means the tags were not properly balanced.
        let mut in_head: i32 = 0;
        let mut in_body: i32 = 0;

        for (part_id, part) in ctx.output.text_parts.iter().enumerate() {
            let is_body_part = ctx.input.message.text_body.contains(&part_id)
                || ctx.input.message.html_body.contains(&part_id);
            let (html_tokens, tokens) = if let TextPart::Html {
                html_tokens,
                tokens,
                ..
            } = part
            {
                (html_tokens, tokens)
            } else {
                continue;
            };
            let mut has_link_to_img = false;

            for token in html_tokens {
                match token {
                    HtmlToken::StartTag {
                        name,
                        attributes,
                        is_self_closing,
                    } => match *name {
                        A => {
                            if let Some(attr) = attributes.iter().find_map(|(attr, value)| {
                                if *attr == HREF {
                                    value.as_deref()
                                } else {
                                    None
                                }
                            }) {
                                let url = attr.trim().to_lowercase();
                                let url_parsed = url.parse::<Uri>().ok();
                                let href = Href {
                                    host: url_parsed
                                        .as_ref()
                                        .and_then(|uri| uri.host().map(Hostname::new)),
                                    url_parsed,
                                };

                                if is_body_part
                                    && attr.starts_with("data:")
                                    && attr.contains(";base64,")
                                {
                                    // Has Data URI encoding
                                    ctx.result.add_tag("HAS_DATA_URI");

                                    if attr.contains("text/") {
                                        // Uses Data URI encoding to obfuscate plain or HTML in base64
                                        ctx.result.add_tag("DATA_URI_OBFU");
                                    }
                                } else if href.host.as_ref().map_or(false, |h| h.ip.is_some()) {
                                    // HTML anchor points to an IP address
                                    ctx.result.add_tag("HTTP_TO_IP");
                                }

                                // Only track the href for non-self-closing anchors,
                                // as those may enclose text/images to analyze.
                                if !*is_self_closing {
                                    last_href = Some(href);
                                }
                            }
                        }
                        IMG if is_body_part => {
                            // Default dimensions assumed when the tag omits them.
                            let mut img_width = 800;
                            let mut img_height = 600;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    let dimension = match *attr {
                                        WIDTH => &mut img_width,
                                        HEIGHT => &mut img_height,
                                        SRC => {
                                            let src = value.to_ascii_lowercase();
                                            if src.starts_with("data:") && src.contains(";base64,")
                                            {
                                                // Has Data URI encoding (fixed: the tag name
                                                // was the comment text instead of the
                                                // HAS_DATA_URI tag used elsewhere)
                                                ctx.result.add_tag("HAS_DATA_URI");
                                            }
                                            continue;
                                        }
                                        _ => {
                                            continue;
                                        }
                                    };

                                    if let Some(pct) = value.strip_suffix('%') {
                                        if let Ok(pct) = pct.trim().parse::<u64>() {
                                            *dimension = (*dimension * pct) / 100;
                                        }
                                    } else if let Ok(value) = value.parse::<u64>() {
                                        *dimension = value;
                                    }
                                }
                            }

                            let dimensions = img_width + img_height;
                            if last_href.is_some() && dimensions >= 210 {
                                has_link_to_img = true;
                            }
                            if dimensions > 100 {
                                // We assume that a single picture 100x200 contains approx 3 words of text
                                html_img_words += dimensions / 100;
                            }
                        }
                        META => {
                            let mut has_equiv_refresh = false;
                            let mut has_content_url = false;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    if *attr == HTTP_EQUIV {
                                        if value.eq_ignore_ascii_case("refresh") {
                                            has_equiv_refresh = true;
                                        }
                                    } else if *attr == CONTENT
                                        && value.to_ascii_lowercase().contains("url=")
                                    {
                                        has_content_url = true;
                                    }
                                }
                            }

                            if has_equiv_refresh && has_content_url {
                                // HTML meta refresh tag
                                ctx.result.add_tag("HTML_META_REFRESH_URL");
                            }
                        }
                        LINK if is_body_part => {
                            let mut has_rel_style = false;
                            let mut has_href_css = false;

                            for (attr, value) in attributes {
                                if let Some(value) =
                                    value.as_deref().map(|v| v.trim()).filter(|v| !v.is_empty())
                                {
                                    if *attr == REL {
                                        if value.to_ascii_lowercase().contains("stylesheet") {
                                            has_rel_style = true;
                                        }
                                    } else if *attr == HREF
                                        && value.to_ascii_lowercase().ends_with(".css")
                                    {
                                        has_href_css = true;
                                    }
                                }
                            }

                            if has_rel_style || has_href_css {
                                // Has external CSS
                                ctx.result.add_tag("EXT_CSS");
                            }
                        }
                        HEAD if !*is_self_closing => {
                            in_head += 1;
                        }
                        BODY if !*is_self_closing => {
                            in_body += 1;
                        }
                        _ => {}
                    },
                    HtmlToken::EndTag { name } => match *name {
                        A => {
                            last_href = None;
                        }
                        HEAD => {
                            in_head -= 1;
                        }
                        BODY => {
                            in_body -= 1;
                        }
                        _ => (),
                    },
                    // Fixed: analyze rendered text (outside <head>) rather than
                    // only text inside <head>. The Sieve rule this ports required
                    // `in_head == 0`; the previous `in_head > 0` guard was inverted.
                    HtmlToken::Text { text } if in_head == 0 => {
                        if let Some((href_url, href_host)) = last_href
                            .as_ref()
                            .and_then(|href| Some((href.url_parsed.as_ref()?, href.host.as_ref()?)))
                        {
                            for token in TypesTokenizer::new(text.as_ref())
                                .tokenize_numbers(false)
                                .tokenize_urls(true)
                                .tokenize_urls_without_scheme(true)
                                .tokenize_emails(true)
                            {
                                let text_url = match token.word {
                                    TokenType::Url(url) => url.to_lowercase(),
                                    TokenType::UrlNoScheme(url) => {
                                        format!("http://{}", url.to_lowercase())
                                    }
                                    _ => continue,
                                };
                                let text_url_parsed =
                                    if let Ok(text_url_parsed) = text_url.parse::<Uri>() {
                                        text_url_parsed
                                    } else {
                                        continue;
                                    };

                                if href_url.scheme().map(|s| s.as_str()).unwrap_or_default()
                                    == "http"
                                    && text_url_parsed
                                        .scheme()
                                        .map(|s| s.as_str())
                                        .unwrap_or_default()
                                        == "https"
                                {
                                    // The anchor text contains a distinct scheme compared to the target URL
                                    ctx.result.add_tag("HTTP_TO_HTTPS");
                                }

                                if let Some(text_url_host) = text_url_parsed.host() {
                                    let text_url_host = Hostname::new(text_url_host);
                                    if text_url_host.sld_or_default() != href_host.sld_or_default()
                                    {
                                        // The anchor text contains a different domain than the target URL
                                        ctx.result.add_tag("PHISHING");
                                    }
                                }
                            }
                        }

                        if is_body_part {
                            html_text_chars += text.chars().filter(|t| t.is_alphanumeric()).count();
                        }
                    }
                    _ => (),
                }
            }

            if is_body_part {
                if in_head != 0 || in_body != 0 {
                    // HTML tags are not properly closed
                    ctx.result.add_tag("HTML_UNBALANCED_TAG");
                }

                if has_link_to_img {
                    match html_text_chars {
                        0..1024 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_1");
                        }
                        1024..1536 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_2");
                        }
                        1536..2048 => {
                            ctx.result.add_tag("HTML_SHORT_LINK_IMG_3");
                        }
                        _ => (),
                    }
                }

                let mut html_words = 0;
                let mut html_uris = 0;
                for token in tokens {
                    match token {
                        TokenType::Alphabetic(_)
                        | TokenType::Alphanumeric(_)
                        | TokenType::Email(_) => {
                            html_words += 1;
                        }
                        TokenType::Url(_) | TokenType::UrlNoScheme(_) => {
                            html_uris += 1;
                        }
                        _ => (),
                    }
                }

                if (!has_link_to_img || html_text_chars >= 2048)
                    && html_img_words as f64 / (html_words as f64 + html_img_words as f64) > 0.5
                {
                    // Message contains more images than text
                    ctx.result.add_tag("HTML_TEXT_IMG_RATIO");
                }

                if html_uris > 0 && html_words == 0 {
                    // Message only contains URIs in HTML
                    ctx.result.add_tag("BODY_URI_ONLY");
                }
            }
        }
    }
}

View file

@ -13,14 +13,14 @@ use crate::{modules::dnsbl::is_dnsbl, SpamFilterContext, TextPart};
use super::{ElementLocation, SpamFilterResolver};
pub trait SpamFilterAnalyzeIpRev: Sync + Send {
pub trait SpamFilterAnalyzeIp: Sync + Send {
fn spam_filter_analyze_ip(
&self,
ctx: &mut SpamFilterContext<'_>,
) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzeIpRev for Server {
impl SpamFilterAnalyzeIp for Server {
async fn spam_filter_analyze_ip(&self, ctx: &mut SpamFilterContext<'_>) {
// IP Address RBL
let mut ips =

View file

@ -0,0 +1,426 @@
use std::{collections::HashSet, future::Future, vec};
use common::{
scripts::functions::{array::cosine_similarity, unicode::CharUtils},
Server,
};
use hyper::Uri;
use mail_parser::{HeaderName, MimeHeaders, PartType};
use nlp::tokenizers::types::TokenType;
use unicode_security::MixedScript;
use crate::{Hostname, SpamFilterContext, TextPart};
/// Extension trait that performs MIME-structure spam analysis on a message.
pub trait SpamFilterAnalyzeMime: Sync + Send {
    /// Inspects the MIME headers, part hierarchy, encodings and attachments
    /// of the message in `ctx` and records matching spam tags in `ctx.result`.
    fn spam_filter_analyze_mime(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzeMime for Server {
    /// Inspects the MIME structure of the message (top-level headers, part
    /// hierarchy, transfer encodings and attachments) and records the
    /// matching spam tags in `ctx.result`.
    async fn spam_filter_analyze_mime(&self, ctx: &mut SpamFilterContext<'_>) {
        // Scan the top-level headers for MIME-related fields.
        let mut has_mime_version = false;
        let mut has_ct = false;
        let mut has_cte = false;
        let mut had_cd = false;
        let mut is_plain_text = false;

        for header in ctx.input.message.headers() {
            match &header.name {
                HeaderName::MimeVersion => {
                    // Tag when the raw header name is not spelled exactly
                    // "MIME-Version" (unusual casing is a spam signal).
                    if ctx
                        .input
                        .message
                        .raw_message()
                        .get(header.offset_field..header.offset_start - 1)
                        != Some(b"MIME-Version")
                    {
                        ctx.result.add_tag("MV_CASE");
                    }
                    has_mime_version = true;
                }
                HeaderName::ContentType => {
                    has_ct = true;
                    is_plain_text = header.value().as_content_type().map_or(false, |ct| {
                        ct.ctype().eq_ignore_ascii_case("text")
                            && ct
                                .subtype()
                                .unwrap_or_default()
                                .eq_ignore_ascii_case("plain")
                    });
                }
                HeaderName::ContentTransferEncoding => {
                    has_cte = true;
                }
                HeaderName::ContentDisposition => {
                    had_cd = true;
                }
                _ => (),
            }
        }

        // MIME headers present without a MIME-Version header.
        if !has_mime_version && (has_ct || has_cte) {
            ctx.result.add_tag("MISSING_MIME_VERSION");
        }

        if has_ct && !is_plain_text && !has_cte && !had_cd {
            // Only Content-Type header without other MIME headers
            ctx.result.add_tag("MIME_HEADER_CTYPE_ONLY");
        }

        let raw_message = ctx.input.message.raw_message();
        let mut has_text_part = false;
        let mut is_encrypted = false;
        let mut is_encrypted_smime = false;
        let mut is_encrypted_pgp = false;
        let mut num_parts = 0;
        let mut num_parts_size = 0;

        for (part_id, part) in ctx.input.message.parts.iter().enumerate() {
            // Per-part Content-* header state.
            let mut ct = None;
            let mut cd = None;
            let mut ct_type = String::new();
            let mut ct_subtype = String::new();
            let mut cte = String::new();
            let mut is_attachment = ctx.input.message.attachments.contains(&part_id);
            let mut has_content_id = false;

            for header in part.headers() {
                match &header.name {
                    HeaderName::ContentType => {
                        if let Some(ct_) = header.value().as_content_type() {
                            ct_type = ct_.ctype().to_ascii_lowercase();
                            ct_subtype = ct_.subtype().unwrap_or_default().to_ascii_lowercase();
                            ct = Some(ct_);
                        }

                        if ct_type.is_empty() {
                            // Content-Type header can't be parsed
                            ctx.result.add_tag("BROKEN_CONTENT_TYPE");
                        }

                        if raw_message
                            .get(header.offset_start..header.offset_end)
                            .and_then(|s| s.trim_ascii_end().last())
                            == Some(&b';')
                        {
                            // Content-Type header ends with a semi-colon
                            ctx.result.add_tag("CT_EXTRA_SEMI");
                        }
                    }
                    HeaderName::ContentTransferEncoding => {
                        let cte_ = header.value().as_text().unwrap_or_default();
                        cte = cte_.to_ascii_lowercase();
                        // Tag non-lowercase Content-Transfer-Encoding values.
                        if cte != cte_ {
                            ctx.result.add_tag("CTE_CASE");
                        }
                    }
                    HeaderName::ContentDisposition => {
                        cd = header.value().as_content_type();
                    }
                    HeaderName::ContentId => {
                        has_content_id = true;
                    }
                    _ => (),
                }
            }

            match ct_type.as_str() {
                "multipart" => {
                    let part_ids = match &part.body {
                        PartType::Multipart(parts) => parts.as_slice(),
                        _ => &[],
                    };

                    match ct_subtype.as_str() {
                        "alternative" => {
                            let mut has_plain_part = false;
                            let mut has_html_part = false;
                            let mut text_part_words = vec![];
                            let mut text_part_uris = 0;
                            let mut html_part_words = vec![];
                            let mut html_part_uris = 0;

                            // Collect words and distinct URI hosts from the first
                            // plain and first HTML alternatives only.
                            for text_part in part_ids.iter().map(|id| &ctx.output.text_parts[*id]) {
                                match text_part {
                                    TextPart::Plain { tokens, .. } if !has_plain_part => {
                                        words_and_uris(
                                            tokens,
                                            &mut text_part_words,
                                            &mut text_part_uris,
                                        );
                                        has_plain_part = true;
                                    }
                                    TextPart::Html { tokens, .. } if !has_html_part => {
                                        words_and_uris(
                                            tokens,
                                            &mut html_part_words,
                                            &mut html_part_uris,
                                        );
                                        has_html_part = true;
                                    }
                                    _ => (),
                                }
                            }

                            // Multipart message mostly text/html MIME
                            if has_html_part {
                                if !has_plain_part {
                                    ctx.result.add_tag("MIME_MA_MISSING_TEXT");
                                }
                            } else if has_plain_part {
                                ctx.result.add_tag("MIME_MA_MISSING_HTML");
                            }

                            // HTML and text parts are different
                            if has_plain_part
                                && has_html_part
                                && (!text_part_words.is_empty() || !html_part_words.is_empty())
                                && cosine_similarity(&text_part_words, &html_part_words) < 0.95
                            {
                                ctx.result.add_tag("R_PARTS_DIFFER");
                            }

                            // Odd URI count between parts
                            if text_part_uris != html_part_uris {
                                ctx.result.add_tag("URI_COUNT_ODD");
                            }
                        }
                        "mixed" => {
                            let mut num_text_parts = 0;
                            let mut has_other_parts = false;

                            for (sub_part_id, sub_part) in part_ids
                                .iter()
                                .map(|id| (*id, &ctx.input.message.parts[*id]))
                            {
                                let ctype = sub_part
                                    .content_type()
                                    .map(|ct| ct.ctype())
                                    .unwrap_or_default();

                                if ctype.eq_ignore_ascii_case("text")
                                    && !ctx.input.message.attachments.contains(&sub_part_id)
                                {
                                    num_text_parts += 1;
                                } else if !ctype.eq_ignore_ascii_case("multipart") {
                                    has_other_parts = true;
                                }
                            }

                            // Found multipart/mixed without non-textual part
                            if !has_other_parts && num_text_parts < 3 {
                                ctx.result.add_tag("CTYPE_MIXED_BOGUS");
                            }
                        }
                        "encrypted" => {
                            is_encrypted = true;
                        }
                        _ => (),
                    }

                    // Multipart containers are not counted in num_parts below.
                    continue;
                }
                "text" => {
                    let mut is_7bit = false;

                    match cte.as_str() {
                        "" | "7bit" => {
                            if raw_message
                                .get(part.raw_body_offset()..part.raw_end_offset())
                                .map_or(false, |bytes| !bytes.is_ascii())
                            {
                                // MIME text part claims to be ASCII but isn't
                                ctx.result.add_tag("R_BAD_CTE_7BIT");
                            }
                            is_7bit = true;
                        }
                        "base64" => {
                            if part.contents().is_ascii() {
                                // Has text part encoded in base64 that does not contain any 8bit characters
                                ctx.result.add_tag("MIME_BASE64_TEXT_BOGUS");
                            } else {
                                // Has text part encoded in base64
                                ctx.result.add_tag("MIME_BASE64_TEXT");
                            }
                        }
                        _ => (),
                    }

                    if !is_7bit
                        && ct_subtype == "plain"
                        && ct
                            .and_then(|ct| ct.attribute("charset"))
                            .map_or(true, |c| c.is_empty())
                    {
                        // Charset header is missing
                        ctx.result.add_tag("R_MISSING_CHARSET");
                    }

                    match &part.body {
                        PartType::Text(text) | PartType::Html(text)
                            if ctx.input.message.text_body.contains(&part_id)
                                || ctx.input.message.html_body.contains(&part_id) =>
                        {
                            if !text.as_ref().is_single_script() {
                                // Text part contains multiple scripts
                                ctx.result.add_tag("R_MIXED_CHARSET");
                            }
                        }
                        _ => (),
                    }

                    has_text_part = true;
                }
                "application" => match ct_subtype.as_str() {
                    "pkcs7-mime" => {
                        ctx.result.add_tag("ENCRYPTED_SMIME");
                        is_attachment = false;
                        is_encrypted_smime = true;
                    }
                    "pkcs7-signature" => {
                        ctx.result.add_tag("SIGNED_SMIME");
                        is_attachment = false;
                    }
                    "pgp-encrypted" => {
                        ctx.result.add_tag("ENCRYPTED_PGP");
                        is_attachment = false;
                        is_encrypted_pgp = true;
                    }
                    "pgp-signature" => {
                        ctx.result.add_tag("SIGNED_PGP");
                        is_attachment = false;
                    }
                    "octet-stream" => {
                        // Octet-stream part without Content-Id and without an
                        // attachment disposition or filename.
                        if !is_encrypted
                            && !has_content_id
                            && cd.map_or(true, |cd| {
                                cd.attribute("type")
                                    .unwrap_or_default()
                                    .to_ascii_lowercase()
                                    != "attachment"
                                    && !cd.has_attribute("filename")
                            })
                        {
                            ctx.result.add_tag("CTYPE_MISSING_DISPOSITION");
                        }
                    }
                    _ => (),
                },
                _ => (),
            }

            num_parts += 1;
            num_parts_size += part.len();
            let ct_full = format!("{ct_type}/{ct_subtype}");

            if is_attachment {
                // Has a MIME attachment
                ctx.result.add_tag("HAS_ATTACHMENT");

                match &part.body {
                    PartType::Binary(bytes) | PartType::InlineBinary(bytes) => {
                        if let Some(t) = infer::get(bytes.as_ref()) {
                            // NOTE(review): MIME_GOOD is tagged when the inferred
                            // type DIFFERS from the declared Content-Type and
                            // MIME_BAD when it matches — this looks inverted
                            // relative to the comments below; confirm the
                            // intended semantics of these tags.
                            if t.mime_type() != ct_full {
                                // Known content-type
                                ctx.result.add_tag("MIME_GOOD");
                            } else if ct_full != "application/octet-stream" {
                                // Known bad content-type
                                ctx.result.add_tag("MIME_BAD");
                            }
                        }
                    }
                    _ => (),
                }
            }

            // Analyze attachment name
            if let Some(attach_name) = part.attachment_name() {
                if attach_name.chars().any(|c| c.is_obscured()) {
                    // Attachment name contains zero-width space
                    ctx.result.add_tag("MIME_BAD_UNICODE");
                }

                let attach_name = attach_name.trim().to_lowercase();
                if let Some((name, ext)) = attach_name.rsplit_once('.').and_then(|(name, ext)| {
                    Some((name, self.core.spam.list_file_extensions.get(ext)?))
                }) {
                    // Second-level extension, e.g. the "tar" in "x.tar.gz".
                    let sub_ext = name
                        .rsplit_once('.')
                        .and_then(|(_, ext)| self.core.spam.list_file_extensions.get(ext));

                    if ext.is_bad {
                        // Attachment has a bad extension
                        if sub_ext.map_or(false, |e| e.is_bad) {
                            ctx.result.add_tag("MIME_DOUBLE_BAD_EXTENSION");
                        } else {
                            ctx.result.add_tag("MIME_BAD_EXTENSION");
                        }
                    }

                    if ext.is_archive && sub_ext.map_or(false, |e| e.is_archive) {
                        // Archive in archive
                        ctx.result.add_tag("MIME_ARCHIVE_IN_ARCHIVE");
                    }

                    if !ext.known_types.is_empty()
                        && ct_full != "application/octet-stream"
                        && !ext.known_types.contains(&ct_full)
                    {
                        // Invalid attachment mime type
                        ctx.result.add_tag("MIME_BAD_ATTACHMENT");
                    }
                }
            }
        }

        match num_parts_size {
            0 => {
                // Message contains no parts
                ctx.result.add_tag("COMPLETELY_EMPTY");
            }
            1..64 if num_parts == 1 => {
                // Message contains only one short part
                ctx.result.add_tag("SINGLE_SHORT_PART");
            }
            _ => (),
        }

        if has_text_part && (is_encrypted_pgp || is_encrypted_smime) {
            // Message contains both text and encrypted parts
            ctx.result.add_tag("BOGUS_ENCRYPTED_AND_TEXT");
        }
    }
}
/// Collects the alphabetic/alphanumeric tokens of `tokens` into `words` and
/// stores the number of distinct URL hosts (SLD, falling back to the FQDN)
/// in `uri_count`.
fn words_and_uris<'x, T: AsRef<str>>(
    tokens: &'x [TokenType<T>],
    words: &mut Vec<&'x str>,
    uri_count: &mut usize,
) {
    let mut hosts = HashSet::new();

    for token in tokens {
        match token {
            TokenType::Alphabetic(word) | TokenType::Alphanumeric(word) => {
                words.push(word.as_ref());
            }
            TokenType::Url(url) => {
                // Deduplicate by registrable domain rather than full URL.
                let host = url
                    .as_ref()
                    .parse::<Uri>()
                    .ok()
                    .and_then(|uri| uri.host().map(Hostname::new));
                if let Some(host) = host {
                    hosts.insert(host.sld.unwrap_or(host.fqdn));
                }
            }
            _ => {}
        }
    }

    *uri_count = hosts.len();
}

View file

@ -19,9 +19,12 @@ pub mod domain;
pub mod ehlo;
pub mod from;
pub mod headers;
pub mod html;
pub mod init;
pub mod ip;
pub mod messageid;
pub mod mime;
pub mod pyzor;
pub mod received;
pub mod recipient;
pub mod replyto;

View file

@ -0,0 +1,35 @@
use std::future::Future;
use common::Server;
use crate::{modules::pyzor::pyzor_check, SpamFilterContext};
/// Extension trait that checks a message against the Pyzor collaborative
/// spam-detection network.
pub trait SpamFilterAnalyzePyzor: Sync + Send {
    /// Queries Pyzor with the message digest and records the result in `ctx`.
    fn spam_filter_analyze_pyzor(
        &self,
        ctx: &mut SpamFilterContext<'_>,
    ) -> impl Future<Output = ()> + Send;
}
impl SpamFilterAnalyzePyzor for Server {
    /// Queries the configured Pyzor server with the message digest and adds
    /// the `PYZOR` tag when the reported counts exceed the configured
    /// thresholds. A no-op when Pyzor is not configured.
    async fn spam_filter_analyze_pyzor(&self, ctx: &mut SpamFilterContext<'_>) {
        if let Some(config) = &self.core.spam.pyzor {
            match pyzor_check(ctx.input.message, config).await {
                Ok(Some(result)) => {
                    // Tag when the digest is widely reported and not
                    // sufficiently whitelisted (absolute count or ratio).
                    if result.code == 200
                        && result.count > config.min_count
                        && (result.wl_count < config.min_wl_count
                            || (result.wl_count as f64 / result.count as f64) < config.ratio)
                    {
                        ctx.result.add_tag("PYZOR");
                    }
                    // TODO: log the time taken by the Pyzor check.
                    // (was `let todo = "log time";`, an unused placeholder binding)
                }
                Ok(None) => {}
                Err(err) => {
                    trc::error!(err.span_id(ctx.input.span_id));
                }
            }
        }
    }
}

View file

@ -438,7 +438,7 @@ fn is_single_html_url<T: AsRef<str>>(html_tokens: &[HtmlToken], tokens: &[TokenT
url_count = 0;
for token in html_tokens {
if matches!(token, HtmlToken::StartTag { name, attributes } if *name == A && attributes.iter().any(|(k, _)| *k == HREF))
if matches!(token, HtmlToken::StartTag { name, attributes, .. } if *name == A && attributes.iter().any(|(k, _)| *k == HREF))
{
url_count += 1;
}

View file

@ -5,6 +5,7 @@ pub enum HtmlToken {
StartTag {
name: u64,
attributes: Vec<(u64, Option<String>)>,
is_self_closing: bool,
},
EndTag {
name: u64,
@ -18,10 +19,46 @@ pub enum HtmlToken {
}
// HTML tag and attribute names are packed into a `u64` with the first ASCII
// character in the lowest byte, so the tokenizer can compare names with a
// single integer comparison. At most eight characters fit in a u64;
// `HTTP_EQUIV` therefore encodes only the first eight ("http-equ").
// NOTE(review): this assumes the tokenizer also stops accumulating after
// eight characters — confirm in `html_to_tokens`.
pub(crate) const A: u64 = b'a' as u64;
pub(crate) const IMG: u64 = (b'i' as u64) | (b'm' as u64) << 8 | (b'g' as u64) << 16;
pub(crate) const HEAD: u64 =
    (b'h' as u64) | (b'e' as u64) << 8 | (b'a' as u64) << 16 | (b'd' as u64) << 24;
pub(crate) const BODY: u64 =
    (b'b' as u64) | (b'o' as u64) << 8 | (b'd' as u64) << 16 | (b'y' as u64) << 24;
pub(crate) const META: u64 =
    (b'm' as u64) | (b'e' as u64) << 8 | (b't' as u64) << 16 | (b'a' as u64) << 24;
pub(crate) const LINK: u64 =
    (b'l' as u64) | (b'i' as u64) << 8 | (b'n' as u64) << 16 | (b'k' as u64) << 24;
pub(crate) const HREF: u64 =
    (b'h' as u64) | (b'r' as u64) << 8 | (b'e' as u64) << 16 | (b'f' as u64) << 24;
pub(crate) const SRC: u64 = (b's' as u64) | (b'r' as u64) << 8 | (b'c' as u64) << 16;
pub(crate) const WIDTH: u64 = (b'w' as u64)
    | (b'i' as u64) << 8
    | (b'd' as u64) << 16
    | (b't' as u64) << 24
    | (b'h' as u64) << 32;
pub(crate) const HEIGHT: u64 = (b'h' as u64)
    | (b'e' as u64) << 8
    | (b'i' as u64) << 16
    | (b'g' as u64) << 24
    | (b'h' as u64) << 32
    | (b't' as u64) << 40;
pub(crate) const REL: u64 = (b'r' as u64) | (b'e' as u64) << 8 | (b'l' as u64) << 16;
pub(crate) const CONTENT: u64 = (b'c' as u64)
    | (b'o' as u64) << 8
    | (b'n' as u64) << 16
    | (b't' as u64) << 24
    | (b'e' as u64) << 32
    | (b'n' as u64) << 40
    | (b't' as u64) << 48;
pub(crate) const HTTP_EQUIV: u64 = (b'h' as u64)
    | (b't' as u64) << 8
    | (b't' as u64) << 16
    | (b'p' as u64) << 24
    | (b'-' as u64) << 32
    | (b'e' as u64) << 40
    | (b'q' as u64) << 48
    | (b'u' as u64) << 56;
pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
let input = input.as_bytes();
@ -106,6 +143,7 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
}
let mut in_quote = false;
let mut is_self_closing = false;
let mut key: u64 = 0;
let mut shift = 0;
@ -123,6 +161,9 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
key |= ((ch - b'A' + b'a') as u64) << shift;
shift += 8;
}
b'/' if !in_quote => {
is_self_closing = true;
}
b'>' if !in_quote => {
if shift != 0 {
if tag == 0 {
@ -205,6 +246,7 @@ pub fn html_to_tokens(input: &str) -> Vec<HtmlToken> {
tags.push(HtmlToken::StartTag {
name: tag,
attributes,
is_self_closing,
});
}
}
@ -292,7 +334,8 @@ mod tests {
tokens,
vec![HtmlToken::StartTag {
name: 7760228,
attributes: vec![]
attributes: vec![],
is_self_closing: false
}]
);
}
@ -325,14 +368,16 @@ mod tests {
vec![
HtmlToken::StartTag {
name: 7760228,
attributes: vec![]
attributes: vec![],
is_self_closing: false
},
HtmlToken::Text {
text: "Hello,".to_string()
},
HtmlToken::StartTag {
name: 1851879539,
attributes: vec![]
attributes: vec![],
is_self_closing: false
},
HtmlToken::Text {
text: " \" world \"".to_string()
@ -358,15 +403,18 @@ mod tests {
attributes: vec![
(1701869940, Some("text".to_string())),
(435761734006, Some("test".to_string()))
]
],
is_self_closing: false
},
HtmlToken::StartTag {
name: 111516266162547,
attributes: vec![]
attributes: vec![],
is_self_closing: true
},
HtmlToken::StartTag {
name: 6647407,
attributes: vec![(1920234593, None)]
attributes: vec![(1920234593, None)],
is_self_closing: true
},
HtmlToken::StartTag {
name: 97,
@ -374,7 +422,8 @@ mod tests {
(98, Some("1".to_string())),
(98, None),
(99, Some("123".to_string()))
]
],
is_self_closing: false
}
]
);

View file

@ -1,4 +1,5 @@
pub mod dnsbl;
pub mod html;
pub mod pyzor;
pub mod remote_list;
pub mod sanitize;

View file

@ -4,16 +4,14 @@
* SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
*/
use sieve::{runtime::Variable, FunctionMap};
use super::PluginContext;
use std::{
borrow::Cow,
io::Write,
net::SocketAddr,
time::{Duration, SystemTime},
};
use common::config::spamfilter::PyzorConfig;
use mail_parser::{decoders::html::add_html_token, Message, PartType};
use nlp::tokenizers::types::{TokenType, TypesTokenizer};
use sha1::{Digest, Sha1};
@ -24,29 +22,27 @@ const ATOMIC_NUM_LINES: usize = 4;
const DIGEST_SPEC: &[(usize, usize)] = &[(20, 3), (60, 3)];
#[derive(Default, Debug, PartialEq, Eq)]
struct PyzorResponse {
code: u32,
count: u64,
wl_count: u64,
pub(crate) struct PyzorResponse {
pub code: u32,
pub count: u64,
pub wl_count: u64,
}
pub fn register(plugin_id: u32, fnc_map: &mut FunctionMap) {
fnc_map.set_external_function("pyzor_check", plugin_id, 2);
}
pub async fn exec(ctx: PluginContext<'_>) -> trc::Result<Variable> {
pub(crate) async fn pyzor_check(
message: &Message<'_>,
config: &PyzorConfig,
) -> trc::Result<Option<PyzorResponse>> {
// Make sure there is at least one text part
if !ctx
.message
if !message
.parts
.iter()
.any(|p| matches!(p.body, PartType::Text(_) | PartType::Html(_)))
{
return Ok(Variable::default());
return Ok(None);
}
// Hash message
let request = ctx.message.pyzor_check_message();
let request = message.pyzor_check_message();
#[cfg(feature = "test_mode")]
{
@ -74,35 +70,21 @@ pub async fn exec(ctx: PluginContext<'_>) -> trc::Result<Variable> {
}
}
let address = ctx.arguments[0].to_string();
let timeout = Duration::from_secs((ctx.arguments[1].to_integer() as u64).clamp(5, 60));
// Send message to address
pyzor_send_message(address.as_ref(), timeout, &request)
pyzor_send_message(config.address, config.timeout, &request)
.await
.map(Into::into)
.map_err(|err| {
trc::SpamEvent::PyzorError
.into_err()
.ctx(trc::Key::Url, address.to_string())
.ctx(trc::Key::Url, config.address.to_string())
.reason(err)
.details("Pyzor failed")
})
}
impl From<PyzorResponse> for Variable {
fn from(response: PyzorResponse) -> Self {
vec![
Variable::from(response.code),
Variable::from(response.count),
Variable::from(response.wl_count),
]
.into()
}
}
async fn pyzor_send_message(
addr: &str,
addr: SocketAddr,
timeout: Duration,
message: &str,
) -> std::io::Result<PyzorResponse> {
@ -451,7 +433,7 @@ mod test {
async fn send_message() {
assert_eq!(
pyzor_send_message(
"public.pyzor.org:24441",
"public.pyzor.org:24441".parse().unwrap(),
Duration::from_secs(10),
concat!(
"Op: check\n",

View file

@ -442,6 +442,7 @@ impl SmtpEvent {
SmtpEvent::RcptToDuplicate => "Duplicate RCPT TO",
SmtpEvent::RcptToRewritten => "RCPT TO address rewritten",
SmtpEvent::RcptToMissing => "RCPT TO address missing",
SmtpEvent::RcptToGreylisted => "RCPT TO greylisted",
SmtpEvent::TooManyRecipients => "Too many recipients",
SmtpEvent::TooManyInvalidRcpt => "Too many invalid recipients",
SmtpEvent::RawInput => "Raw SMTP input received",
@ -552,6 +553,7 @@ impl SmtpEvent {
}
SmtpEvent::RcptToRewritten => "The envelope recipient address was rewritten",
SmtpEvent::RcptToMissing => "The remote client issued a DATA command before RCPT TO",
SmtpEvent::RcptToGreylisted => "The recipient was greylisted",
SmtpEvent::TooManyRecipients => {
"The remote client exceeded the number of recipients allowed"
}

View file

@ -186,6 +186,7 @@ impl EventType {
| SmtpEvent::MailboxDoesNotExist
| SmtpEvent::RelayNotAllowed
| SmtpEvent::RcptTo
| SmtpEvent::RcptToGreylisted
| SmtpEvent::TooManyInvalidRcpt
| SmtpEvent::Vrfy
| SmtpEvent::VrfyNotFound

View file

@ -393,6 +393,7 @@ pub enum SmtpEvent {
RcptToDuplicate,
RcptToRewritten,
RcptToMissing,
RcptToGreylisted,
TooManyRecipients,
TooManyInvalidRcpt,
RawInput,

View file

@ -866,6 +866,7 @@ impl EventType {
EventType::Security(SecurityEvent::ScanBan) => 558,
EventType::Store(StoreEvent::AzureError) => 559,
EventType::TlsRpt(TlsRptEvent::RecordNotFound) => 560,
EventType::Smtp(SmtpEvent::RcptToGreylisted) => 561,
}
}
@ -1472,6 +1473,7 @@ impl EventType {
558 => Some(EventType::Security(SecurityEvent::ScanBan)),
559 => Some(EventType::Store(StoreEvent::AzureError)),
560 => Some(EventType::TlsRpt(TlsRptEvent::RecordNotFound)),
561 => Some(EventType::Smtp(SmtpEvent::RcptToGreylisted)),
_ => None,
}
}

View file

@ -1,9 +0,0 @@
set "triplet" "g:${env.remote_ip}.${envelope.from}.${envelope.to}";
if eval "!key_exists(SPAM_DB, triplet)" {
# Greylist sender for 30 days
eval "key_set(SPAM_DB, triplet, '', 2592000)";
reject "422 4.2.2 Greylisted, please try again in a few moments.";
stop;
}

View file

@ -1,148 +0,0 @@
# Rule set: analyze text/html MIME parts and raise HTML-related spam tags
# (phishing anchors, data-URI obfuscation, image-heavy bodies, etc.).

# Message only has text/html MIME parts
if eval "header.content-type == 'text/html'" {
    let "t.MIME_HTML_ONLY" "1";
}
foreverypart {
    if eval "eq_ignore_case(header.content-type, 'text/html')" {
        # Tokenize HTML
        let "is_body_part" "is_body()";
        let "html_tokens" "tokenize(part.text, 'html')";
        let "html_tokens_len" "len(html_tokens)";
        # Counters used for the short-part and text-to-image ratio checks below.
        let "html_char_count" "0";
        # NOTE(review): html_space_count is accumulated but never read — confirm
        # whether it is intentionally unused.
        let "html_space_count" "0";
        let "html_img_words" "0";
        let "html_words" "0";
        let "has_link_to_img" "0";
        let "has_uri" "0";
        let "has_text" "0";
        # Tag-nesting depth trackers, used for the unbalanced-tag check.
        let "in_head" "0";
        let "in_body" "0";
        # Anchor state: inside <a>, whether the href host is an IP literal,
        # and the normalized href itself.
        let "in_anchor" "0";
        let "in_anchor_href_ip" "0";
        let "in_anchor_href" "";
        let "i" "0";
        while "i < html_tokens_len" {
            let "token" "html_tokens[i]";
            let "i" "i + 1";
            # Tokens starting with '_' are text nodes
            if eval "starts_with(token, '_')" {
                # Text inside <head> is not user-visible, so skip it.
                if eval "in_head == 0" {
                    let "html_char_count" "html_char_count + count_chars(token)";
                    let "html_space_count" "html_space_count + count_spaces(token)";
                    let "text" "to_lowercase(trim(strip_prefix(token, '_')))";
                    let "html_words" "html_words + len(tokenize(text, 'words'))";
                    let "uris" "tokenize(text, 'uri')";
                    if eval "!is_empty(uris)" {
                        let "has_uri" "1";
                        let "uri" "uris[0]";
                        # Compare the URI shown in the anchor text against the
                        # actual link target.
                        if eval "in_anchor && !is_empty(in_anchor_href)" {
                            if eval "contains(text, '://') &&
                                     uri_part(uri, 'scheme') != uri_part(in_anchor_href, 'scheme')" {
                                # The anchor text contains a distinct scheme compared to the target URL
                                let "t.HTTP_TO_HTTPS" "1";
                            }
                            # Displayed URI and target disagree on host:
                            # second-level domain for host names, exact match
                            # for IP-literal targets.
                            if eval "(!in_anchor_href_ip && (domain_part(uri_part(uri, 'host'), 'sld') != domain_part(uri_part(in_anchor_href, 'host'), 'sld'))) ||
                                     (in_anchor_href_ip && (uri_part(uri, 'host') != uri_part(in_anchor_href, 'host')))" {
                                let "t.PHISHING" "1";
                            }
                        }
                    } elsif eval "!is_empty(text)" {
                        let "has_text" "1";
                    }
                }
            } elsif eval "starts_with(token, '<img')" {
                if eval "is_body_part" {
                    # Missing width/height attributes default to 800x600.
                    let "dimensions" "html_attr_size(token, 'width', 800) + html_attr_size(token, 'height', 600)";
                    if eval "in_anchor && dimensions >= 210" {
                        let "has_link_to_img" "1";
                    }
                    if eval "dimensions > 100" {
                        # We assume that a single picture 100x200 contains approx 3 words of text
                        let "html_img_words" "html_img_words + dimensions / 100";
                    }
                    let "img_src" "html_attr(token, 'src')";
                    if eval "starts_with(img_src, 'data:') && contains(img_src, ';base64,')" {
                        # Has Data URI encoding
                        let "t.HAS_DATA_URI" "1";
                    }
                }
            } elsif eval "starts_with(token, '<head')" {
                let "in_head" "in_head + 1";
            } elsif eval "starts_with(token, '</head')" {
                let "in_head" "in_head - 1";
            } elsif eval "starts_with(token, '<body')" {
                let "in_body" "in_body + 1";
            } elsif eval "starts_with(token, '</body')" {
                let "in_body" "in_body - 1";
            } elsif eval "starts_with(token, '<a ')" {
                # Entering an anchor: remember its normalized href for the
                # text-vs-target comparisons above.
                let "in_anchor" "1";
                let "in_anchor_href_ip" "0";
                let "in_anchor_href" "to_lowercase(trim(html_attr(token, 'href')))";
                if eval "is_body_part && starts_with(in_anchor_href, 'data:') && contains(in_anchor_href, ';base64,')" {
                    # Has Data URI encoding
                    let "t.HAS_DATA_URI" "1";
                    if eval "contains(in_anchor_href, 'text/')" {
                        # Uses Data URI encoding to obfuscate plain or HTML in base64
                        let "t.DATA_URI_OBFU" "1";
                    }
                } elsif eval "is_ip_addr(uri_part(in_anchor_href, 'host'))" {
                    # HTML anchor points to an IP address
                    let "t.HTTP_TO_IP" "1";
                    let "in_anchor_href_ip" "1";
                }
            } elsif eval "in_anchor && starts_with(token, '</a')" {
                let "in_anchor" "0";
            } elsif eval "starts_with(token, '<meta ')" {
                if eval "eq_ignore_case(html_attr(token, 'http-equiv'), 'refresh') &&
                         contains_ignore_case(html_attr(token, 'content'), 'url=')" {
                    # HTML meta refresh tag
                    let "t.HTML_META_REFRESH_URL" "1";
                }
            } elsif eval "starts_with(token, '<link') && is_body_part &&
                         (contains_ignore_case(html_attr(token, 'rel'), 'stylesheet') ||
                          contains_ignore_case(html_attr(token, 'href'), '.css') )" {
                # Body part pulls in an external stylesheet.
                let "t.EXT_CSS" "1";
            }
        }
        if eval "is_body_part" {
            # Check for unbalanced tags
            if eval "in_head != 0 || in_body != 0" {
                let "t.HTML_UNBALANCED_TAG" "1";
            }
            # Check for short HTML parts with a link to an image
            if eval "has_link_to_img" {
                if eval "html_char_count < 1024" {
                    let "t.HTML_SHORT_LINK_IMG_1" "1";
                } elsif eval "html_char_count < 1536" {
                    let "t.HTML_SHORT_LINK_IMG_2" "1";
                } elsif eval "html_char_count < 2048" {
                    let "t.HTML_SHORT_LINK_IMG_3" "1";
                }
            }
            # NOTE(review): if the part has neither words nor images the
            # divisor below is zero — confirm the eval runtime handles this.
            if eval "(!has_link_to_img || html_char_count >= 2048) &&
                     (html_img_words / (html_words + html_img_words) > 0.5)" {
                # Message contains more images than text
                let "t.HTML_TEXT_IMG_RATIO" "1";
            }
            # Part contains URIs but no other visible text.
            if eval "has_uri && !has_text" {
                let "t.BODY_URI_ONLY" "1";
            }
        }
    }
}

View file

@ -1,232 +0,0 @@
if eval "!header.mime-version.exists" {
if eval "header.content-type.exists || header.content-transfer-encoding.exists" {
let "t.MISSING_MIME_VERSION" "1";
}
} elsif eval "header.mime-version.raw_name != 'MIME-Version'" {
let "t.MV_CASE" "1";
}
let "has_text_part" "0";
let "is_encrypted" "0";
let "parts_num" "0";
let "parts_max_len" "0";
if eval "header.Content-Type.exists && !header.Content-Disposition:Content-Transfer-Encoding:MIME-Version.exists && !eq_ignore_case(header.Content-Type, 'text/plain')" {
# Only Content-Type header without other MIME headers
let "t.MIME_HEADER_CTYPE_ONLY" "1";
}
foreverypart {
let "content_type" "to_lowercase(header.content-type)";
let "type" "to_lowercase(header.content-type.type)";
let "subtype" "to_lowercase(header.content-type.subtype)";
let "cte" "header.content-transfer-encoding";
let "part_is_attachment" "is_attachment()";
if eval "cte != '' && !is_lowercase(cte)" {
let "cte" "to_lowercase(cte)";
let "t.CTE_CASE" "1";
}
if eval "ends_with(header.content-type.raw, ';')" {
# Content-Type header ends with a semi-colon
let "t.CT_EXTRA_SEMI" "1";
}
if eval "type == 'multipart'" {
if eval "subtype == 'alternative'" {
let "has_plain_part" "0";
let "has_html_part" "0";
let "text_part_words" "";
let "text_part_uris" "0";
let "html_part_words" "";
let "html_part_uris" "0";
foreverypart {
let "ma_ct" "to_lowercase(header.content-type)";
if eval "!has_plain_part && ma_ct == 'text/plain'" {
let "text_part" "part.text";
let "text_part_words" "tokenize(text_part, 'words')";
let "text_part_uris" "count(dedup(uri_part(tokenize(text_part, 'uri_strict'), 'host')))";
let "has_plain_part" "1";
} elsif eval "!has_html_part && ma_ct == 'text/html'" {
let "html_part" "html_to_text(part.text)";
let "html_part_words" "tokenize(html_part, 'words')";
let "html_part_uris" "count(dedup(uri_part(tokenize(part.text, 'uri_strict'), 'host')))";
let "has_html_part" "1";
}
}
# Multipart message mostly text/html MIME
if eval "has_html_part" {
if eval "!has_plain_part" {
let "t.MIME_MA_MISSING_TEXT" "1";
}
} elsif eval "has_plain_part" {
let "t.MIME_MA_MISSING_HTML" "1";
}
# HTML and text parts are different
if eval "!t.R_PARTS_DIFFER && has_html_part && has_plain_part &&
(!is_empty(text_part_words) || !is_empty(html_part_words)) &&
cosine_similarity(text_part_words, html_part_words) < 0.95" {
let "t.R_PARTS_DIFFER" "1";
}
# Odd URI count between parts
if eval "text_part_uris != html_part_uris" {
set "t.URI_COUNT_ODD" "1";
}
} elsif eval "subtype == 'mixed'" {
let "num_text_parts" "0";
let "has_other_part" "0";
foreverypart {
if eval "eq_ignore_case(header.content-type.type, 'text') && !is_attachment()" {
let "num_text_parts" "num_text_parts + 1";
} elsif eval "!eq_ignore_case(header.content-type.type, 'multipart')" {
let "has_other_part" "1";
}
}
# Found multipart/mixed without non-textual part
if eval "!has_other_part && num_text_parts < 3" {
let "t.CTYPE_MIXED_BOGUS" "1";
}
} elsif eval "subtype == 'encrypted'" {
set "is_encrypted" "1";
}
} else {
if eval "type == 'text'" {
# MIME text part claims to be ASCII but isn't
if eval "cte == '' || cte == '7bit'" {
if eval "!is_ascii(part.raw)" {
let "t.R_BAD_CTE_7BIT" "1";
}
} else {
if eval "cte == 'base64'" {
if eval "is_ascii(part.text)" {
# Has text part encoded in base64 that does not contain any 8bit characters
let "t.MIME_BASE64_TEXT_BOGUS" "1";
} else {
# Has text part encoded in base64
let "t.MIME_BASE64_TEXT" "1";
}
}
if eval "subtype == 'plain' && is_empty(header.content-type.attr.charset)" {
# Charset header is missing
let "t.R_MISSING_CHARSET" "1";
}
}
let "has_text_part" "1";
} elsif eval "type == 'application'" {
if eval "subtype == 'pkcs7-mime'" {
let "t.ENCRYPTED_SMIME" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pkcs7-signature'" {
let "t.SIGNED_SMIME" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pgp-encrypted'" {
let "t.ENCRYPTED_PGP" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'pgp-signature'" {
let "t.SIGNED_PGP" "1";
let "part_is_attachment" "0";
} elsif eval "subtype == 'octet-stream'" {
if eval "!is_encrypted &&
!header.content-id.exists &&
(!header.content-disposition.exists ||
(!eq_ignore_case(header.content-disposition.type, 'attachment') &&
is_empty(header.content-disposition.attr.filename)))" {
let "t.CTYPE_MISSING_DISPOSITION" "1";
}
}
}
# Increase part count
let "parts_num" "parts_num + 1";
if eval "parts_num == 1" {
let "parts_len" "mime_part_len()";
if eval "parts_len > parts_max_len" {
let "parts_max_len" "parts_len";
}
}
}
if eval "is_empty(type) && header.content-type.exists" {
let "t.BROKEN_CONTENT_TYPE" "1";
}
if eval "part_is_attachment" {
# Has a MIME attachment
let "t.HAS_ATTACHMENT" "1";
# Detect and compare mime type
let "detected_mime_type" "detect_file_type('mime')";
if eval "!is_empty(detected_mime_type)" {
if eval "detected_mime_type == content_type" {
# Known content-type
let "t.MIME_GOOD" "1";
} elsif eval "content_type != 'application/octet-stream'" {
# Known bad content-type
let "t.MIME_BAD" "1";
}
}
}
# Analyze attachment name
let "attach_name" "attachment_name()";
if eval "!is_empty(attach_name)" {
if eval "has_obscured(attach_name)" {
let "t.MIME_BAD_UNICODE" "1";
}
let "name_parts" "rsplit(to_lowercase(attach_name), '.')";
if eval "count(name_parts) > 1" {
let "ext_type" "key_get('spam-mime', name_parts[0])";
if eval "!is_empty(ext_type)" {
let "ext_type_double" "key_get('spam-mime', name_parts[1])";
if eval "contains(ext_type, 'BAD')" {
# Bad extension
if eval "contains(ext_type_double, 'BAD')" {
let "t.MIME_DOUBLE_BAD_EXTENSION" "1";
} else {
let "t.MIME_BAD_EXTENSION" "1";
}
}
if eval "contains(ext_type, 'AR') && contains(ext_type_double, 'AR')" {
# Archive in archive
let "t.MIME_ARCHIVE_IN_ARCHIVE" "1";
}
if eval "contains(ext_type, '/') &&
content_type != 'application/octet-stream' &&
!contains(split(ext_type, '|'), content_type)" {
# Invalid attachment mime type
let "t.MIME_BAD_ATTACHMENT" "1";
}
}
}
}
}
# Message contains both text and encrypted parts
if eval "has_text_part && (t.ENCRYPTED_SMIME || t.ENCRYPTED_PGP)" {
let "t.BOGUS_ENCRYPTED_AND_TEXT" "1";
}
# Message contains only one short part
if eval "parts_num == 1 && parts_max_len < 64" {
let "t.SINGLE_SHORT_PART" "1";
} elsif eval "parts_max_len == 0" {
let "t.COMPLETELY_EMPTY" "1";
}
# Check for mixed script in body
if eval "!is_single_script(text_body)" {
let "t.R_MIXED_CHARSET" "1";
}

View file

@ -1,11 +0,0 @@
# Pyzor collaborative spam digest check.
# Check message hash against Pyzor on public.pyzor.org:24441 using a 5 second timeout
let "pyzor_response" "pyzor_check('public.pyzor.org:24441', 5)";
# Response layout (as consumed below): [0] = status code (200 = success),
# [1] = number of spam reports, [2] = number of whitelist reports.
if eval "!is_empty(pyzor_response) && pyzor_response[0] == 200" {
    let "count" "pyzor_response[1]";
    let "wl_count" "pyzor_response[2]";
    # Tag as Pyzor-listed when widely reported and not significantly
    # whitelisted (few whitelist reports, or a low whitelist-to-report ratio).
    if eval "count > 5 && (wl_count < 10 || wl_count / count < 0.2)" {
        let "t.PYZOR" "1";
    }
}

View file

@ -1,12 +0,0 @@
# Bayes classifier training entry point. The 'train' environment variable
# selects the class ('spam' or 'ham'); anything else is rejected.
# Obtain thread name and subject
let "contents" "thread_name(header.subject) + ' ' + body.to_text";
if eval "env.train == 'spam'" {
    # Third argument selects the class: true = train as spam.
    eval "bayes_train(SPAM_DB, contents, true)";
} elsif eval "env.train == 'ham'" {
    # false = train as ham.
    eval "bayes_train(SPAM_DB, contents, false)";
} else {
    # Refuse to train without an explicit class.
    reject "Missing variable 'train'";
}