From dbaaff48f0b7b4575876a9dfb094d5264c7a5a3e Mon Sep 17 00:00:00 2001 From: Mauro D Date: Sun, 20 Aug 2023 17:44:26 +0200 Subject: [PATCH] AST parser for SpamAssassin meta expressions --- crates/antispam/src/import/ast.rs | 177 ++++++ crates/antispam/src/import/meta.rs | 592 +++++++-------------- crates/antispam/src/import/mod.rs | 69 ++- crates/antispam/src/import/spamassassin.rs | 6 +- crates/antispam/src/import/tokenizer.rs | 181 +++++++ 5 files changed, 612 insertions(+), 413 deletions(-) create mode 100644 crates/antispam/src/import/ast.rs create mode 100644 crates/antispam/src/import/tokenizer.rs diff --git a/crates/antispam/src/import/ast.rs b/crates/antispam/src/import/ast.rs new file mode 100644 index 00000000..3b46098b --- /dev/null +++ b/crates/antispam/src/import/ast.rs @@ -0,0 +1,177 @@ +use super::{BinaryOperator, Comparator, Expr, Logical, Operation, Token, UnaryOperator}; + +pub struct Parser<'x> { + tokens: &'x [Token], + position: usize, +} + +impl<'x> Parser<'x> { + pub fn new(tokens: &'x [Token]) -> Self { + Self { + tokens, + position: 0, + } + } + + pub fn consume(&mut self) -> Option<&'x Token> { + if self.position < self.tokens.len() { + let token = &self.tokens[self.position]; + self.position += 1; + Some(token) + } else { + None + } + } + + pub fn peek(&self) -> Option<&'x Token> { + if self.position < self.tokens.len() { + Some(&self.tokens[self.position]) + } else { + None + } + } + + fn primary(&mut self) -> Result { + match self.peek() { + Some(&Token::Number(n)) => { + self.consume(); + Ok(Expr::Literal(n)) + } + Some(Token::Tag(ref id)) => { + self.consume(); + Ok(Expr::Identifier(id.clone())) + } + Some(&Token::OpenParen) => { + self.consume(); + let expr = self.expr(); + if let Some(&Token::CloseParen) = self.peek() { + self.consume(); + expr + } else { + Err("Expected closing parenthesis".to_string()) + } + } + _ => Err("Unexpected token in factor".to_string()), + } + } + + fn unary(&mut self) -> Result { + match self.peek() { + Some(&Token::Logical(Logical::Not)) => { + self.consume(); + let operand = self.primary()?; + Ok(Expr::UnaryOp(UnaryOperator::Not, Box::new(operand))) + } + Some(&Token::Operation(Operation::Subtract)) => { + self.consume(); + let operand = self.primary()?; + Ok(Expr::UnaryOp(UnaryOperator::Minus, Box::new(operand))) + } + _ => self.primary(), + } + } + + fn factor(&mut self) -> Result { + let mut left = self.unary()?; + + while let Some(op @ Token::Operation(Operation::Multiply | Operation::Divide)) = self.peek() + { + self.consume(); + let right = self.unary()?; + left = Expr::BinaryOp(Box::new(left), op.into(), Box::new(right)); + } + + Ok(left) + } + + fn term(&mut self) -> Result { + let mut left = self.factor()?; + + while let Some(op @ Token::Operation(Operation::Add | Operation::Subtract)) = self.peek() { + self.consume(); + let right = self.factor()?; + left = Expr::BinaryOp(Box::new(left), op.into(), Box::new(right)); + } + + Ok(left) + } + + fn bitwise(&mut self) -> Result { + let mut left = self.term()?; + while let Some(op @ Token::Operation(Operation::And | Operation::Or)) = self.peek() { + self.consume(); + let right = self.term()?; + left = Expr::BinaryOp(Box::new(left), op.into(), Box::new(right)); + } + Ok(left) + } + + fn comparison(&mut self) -> Result { + let mut left = self.bitwise()?; + + while let Some(op @ Token::Comparator(_)) = self.peek() { + self.consume(); + let right = self.bitwise()?; + left = Expr::BinaryOp(Box::new(left), op.into(), Box::new(right)); + } + + Ok(left) + } + + fn logical_and(&mut 
self) -> Result { + let mut left = self.comparison()?; + + while let Some(Token::Logical(Logical::And)) = self.peek() { + self.consume(); + let right = self.comparison()?; + left = Expr::BinaryOp(Box::new(left), BinaryOperator::And, Box::new(right)); + } + Ok(left) + } + + fn logical_or(&mut self) -> Result { + let mut left = self.logical_and()?; + + while let Some(Token::Logical(Logical::Or)) = self.peek() { + self.consume(); + let right = self.logical_and()?; + left = Expr::BinaryOp(Box::new(left), BinaryOperator::Or, Box::new(right)); + } + Ok(left) + } + + fn expr(&mut self) -> Result { + self.logical_or() + } + + pub fn parse(&mut self) -> Result { + let result = self.expr()?; + if self.position < self.tokens.len() { + println!("{result:#?}\n {} {}", self.position, self.tokens.len()); + Err("Unexpected tokens at the end of the expression".to_string()) + } else { + Ok(result) + } + } +} + +impl From<&Token> for BinaryOperator { + fn from(value: &Token) -> Self { + match value { + Token::Operation(Operation::Add) => Self::Add, + Token::Operation(Operation::Multiply) => Self::Multiply, + Token::Operation(Operation::Divide) => Self::Divide, + Token::Operation(Operation::Subtract) => Self::Subtract, + Token::Operation(Operation::And) => Self::BitwiseAnd, + Token::Operation(Operation::Or) => Self::BitwiseOr, + Token::Logical(Logical::And) => Self::And, + Token::Logical(Logical::Or) => Self::Or, + Token::Comparator(Comparator::Gt) => Self::Greater, + Token::Comparator(Comparator::Lt) => Self::Lesser, + Token::Comparator(Comparator::Ge) => Self::GreaterOrEqual, + Token::Comparator(Comparator::Le) => Self::LesserOrEqual, + Token::Comparator(Comparator::Eq) => Self::Equal, + _ => panic!("Invalid token"), + } + } +} diff --git a/crates/antispam/src/import/meta.rs b/crates/antispam/src/import/meta.rs index 199a1f56..e7c40773 100644 --- a/crates/antispam/src/import/meta.rs +++ b/crates/antispam/src/import/meta.rs @@ -1,336 +1,40 @@ -use std::{collections::HashMap, fmt::Display, iter::Peekable, str::Chars}; +use std::fmt::Display; -use super::{Comparator, Logical, Operation, Token}; - -// Parse a meta expression into a list of tokens that can be easily -// converted into a Sieve test. -// The parser is not very robust but works on all SpamAssassin meta expressions. -// It might be a good idea in the future to instead build a parse tree and -// then convert that into a Sieve expression. - -#[derive(Debug, Clone, Default)] -pub struct MetaExpression { - pub tokens: Vec, - depth_range: HashMap, - depth: u32, -} - -#[derive(Debug, Clone)] -pub struct TokenDepth { - pub token: Token, - depth: u32, - prefix: Vec, -} - -#[derive(Debug, Clone, Default)] -struct DepthRange { - start: usize, - end: usize, - expr_end: Option<(usize, bool)>, - logic_end: bool, -} +use super::{ + ast::Parser, tokenizer::Tokenizer, BinaryOperator, Comparator, Expr, Logical, MetaExpression, + Operation, Token, UnaryOperator, UnwrapResult, +}; +// Parse a meta expression into a list of tokens that can be converted into a Sieve test. 
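+//
+// Rough shape of the pipeline (the input string below is only illustrative):
+//
+//   let tokens = Tokenizer::new("RULE_A > 2 && RULE_B").collect::<Vec<Token>>();
+//   let expr = Parser::new(&tokens).parse()?;                 // AST (Expr)
+//   let sieve = MetaExpression { tokens, expr }.to_string();  // "if allof(...)"
+//
+// An expression without any comparator, e.g. "RULE_A && RULE_B", is first
+// rewritten below as "(RULE_A & RULE_B) > 0" so that it still parses as a
+// numeric test.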
impl MetaExpression { pub fn from_meta(expr: &str) -> Self { - let mut meta = MetaExpression::default(); - let mut seen_comp = false; - let mut buf = String::new(); - let mut iter = expr.chars().peekable(); + let mut tokens = Tokenizer::new(expr).collect::>(); - while let Some(ch) = iter.next() { - match ch { - 'A'..='Z' | 'a'..='z' | '0'..='9' | '_' => { - buf.push(ch); - } - _ => { - if !buf.is_empty() { - let token = Token::from(buf); - buf = String::new(); - if !seen_comp && !meta.has_comparator(iter.clone()) { - meta.push(token); - meta.push(Token::Comparator(Comparator::Gt)); - meta.push(Token::Number(0)); - seen_comp = true; - } else { - meta.push(token); - } + // If there are no comparators, we can just turn it into am expression + if !tokens.iter().any(|t| matches!(t, Token::Comparator(_))) { + let prev_tokens = tokens; + tokens = Vec::with_capacity(prev_tokens.len() + 3); + tokens.push(Token::OpenParen); + for token in prev_tokens { + tokens.push(if let Token::Logical(op) = token { + match op { + Logical::And => Token::Operation(Operation::And), + Logical::Or => Token::Operation(Operation::Or), + Logical::Not => Token::Logical(Logical::Not), } - - match ch { - '&' => { - seen_comp = false; - if matches!(iter.next(), Some('&')) { - meta.push(Token::Logical(Logical::And)); - } else { - eprintln!("Warning: Single & in meta expression {expr}",); - } - } - '|' => { - seen_comp = false; - if matches!(iter.next(), Some('|')) { - meta.push(Token::Logical(Logical::Or)); - } else { - eprintln!("Warning: Single | in meta expression {expr}",); - } - } - '!' => { - seen_comp = false; - meta.push(Token::Logical(Logical::Not)) - } - '=' => { - seen_comp = true; - meta.push(match iter.next() { - Some('=') => Token::Comparator(Comparator::Eq), - Some('>') => Token::Comparator(Comparator::Ge), - Some('<') => Token::Comparator(Comparator::Le), - _ => { - eprintln!("Warning: Single = in meta expression {expr}",); - Token::Comparator(Comparator::Eq) - } - }); - } - '>' => { - seen_comp = true; - meta.push(match iter.peek() { - Some('=') => { - iter.next(); - Token::Comparator(Comparator::Ge) - } - _ => Token::Comparator(Comparator::Gt), - }) - } - '<' => { - seen_comp = true; - meta.push(match iter.peek() { - Some('=') => { - iter.next(); - Token::Comparator(Comparator::Le) - } - _ => Token::Comparator(Comparator::Lt), - }) - } - '(' => meta.push(Token::OpenParen), - ')' => { - if meta.depth == 0 { - eprintln!( - "Warning: Unmatched close parenthesis in meta expression {expr}" - ); - } - - meta.push(Token::CloseParen) - } - '+' => meta.push(Token::Operation(Operation::Add)), - '*' => meta.push(Token::Operation(Operation::Multiply)), - '/' => meta.push(Token::Operation(Operation::Divide)), - ' ' => {} - _ => { - eprintln!("Warning: Invalid character {ch} in meta expression {expr}"); - break; - } - } - } - } - } - - if meta.depth > 0 { - eprintln!("Warning: Unmatched open parenthesis in meta expression {expr}"); - } - - if !buf.is_empty() { - meta.push(Token::from(buf)); - if !seen_comp { - meta.push(Token::Comparator(Comparator::Gt)); - meta.push(Token::Number(0)); - } - } - - meta.finalize(); - meta - } - - fn push(&mut self, mut token: Token) { - let pos = self.tokens.len(); - let depth_range = self - .depth_range - .entry(self.depth) - .or_insert_with(|| DepthRange { - start: pos, - end: pos, - ..Default::default() - }); - depth_range.end = pos; - let mut depth = self.depth; - let mut prefix = vec![]; - - match &token { - Token::OpenParen => { - if let Some((pos, true)) = depth_range.expr_end { - 
depth_range.expr_end = Some((pos, false)); - } - self.depth += 1; - } - Token::CloseParen => { - if let Some((pos, is_static)) = depth_range.expr_end.take() { - self.tokens[pos + 2] - .prefix - .push(Token::BeginExpression(is_static)); - prefix.push(Token::EndExpression(is_static)); - } - if depth_range.logic_end { - prefix.push(Token::CloseParen); - } - self.depth = self.depth.saturating_sub(1); - depth = self.depth; - } - Token::Logical(op) => { - if self - .tokens - .iter() - .any(|t| matches!(t.token, Token::Comparator(_)) && t.depth < depth) - { - token = Token::Operation(match op { - Logical::And => Operation::And, - Logical::Or => Operation::Or, - Logical::Not => Operation::Not, - }); - if let Some((pos, true)) = depth_range.expr_end { - depth_range.expr_end = Some((pos, false)); - } - } else if matches!(op, Logical::Or | Logical::And) { - let start_prefix = &mut self.tokens[depth_range.start].prefix; - if !start_prefix.contains(&Token::Logical(Logical::And)) - && !start_prefix.contains(&Token::Logical(Logical::Or)) - { - start_prefix.insert(0, token.clone()); - } - depth_range.logic_end = true; - if let Some((pos, is_static)) = depth_range.expr_end.take() { - self.tokens[pos + 2] - .prefix - .push(Token::BeginExpression(is_static)); - prefix.push(Token::EndExpression(is_static)); - } - } - } - Token::Comparator(_) => { - let mut is_static = true; - let mut start_pos = usize::MAX; - for (pos, token) in self.tokens.iter_mut().enumerate().rev() { - if token.depth >= depth { - start_pos = pos; - match &token.token { - Token::Logical(op) => { - if token.depth == depth { - start_pos += 1; - break; - } else { - is_static = false; - token.token = Token::Operation(match op { - Logical::And => Operation::And, - Logical::Or => Operation::Or, - Logical::Not => Operation::Not, - }); - token.prefix.clear(); - } - } - Token::OpenParen - | Token::CloseParen - | Token::Operation(_) - | Token::Tag(_) => { - is_static = false; - } - _ => {} - } - } else { - break; - } - } - if start_pos != usize::MAX { - self.tokens.push(TokenDepth { - token: Token::EndExpression(is_static), - depth, - prefix: vec![], - }); - self.tokens[start_pos].prefix = - vec![token.clone(), Token::BeginExpression(is_static)]; - depth_range.expr_end = Some((pos, true)); - } - } - Token::Tag(_) | Token::Operation(_) => { - if let Some((pos, true)) = depth_range.expr_end { - depth_range.expr_end = Some((pos, false)); - } - } - _ => {} - } - self.tokens.push(TokenDepth { - token, - depth, - prefix, - }) - } - - fn finalize(&mut self) { - if let Some(depth_range) = self.depth_range.get(&self.depth) { - if let Some((pos, is_static)) = depth_range.expr_end { - self.tokens[pos + 2] - .prefix - .push(Token::BeginExpression(is_static)); - self.tokens.push(TokenDepth { - token: Token::EndExpression(is_static), - depth: self.depth, - prefix: vec![], + } else { + token }); } - if depth_range.logic_end { - self.tokens.push(TokenDepth { - token: Token::CloseParen, - depth: self.depth, - prefix: vec![], - }); - } - } - } - - fn has_comparator(&self, iter: Peekable>) -> bool { - let mut d = self.depth; - let mut comp_depth = None; - let mut logic_depth = None; - - for (pos, ch) in iter.enumerate() { - match ch { - '(' => { - d += 1; - } - ')' => { - d = d.saturating_sub(1); - } - '>' | '<' | '=' => { - comp_depth = Some((pos, d)); - break; - } - '&' | '|' => { - if d <= self.depth { - logic_depth = Some((pos, d)); - } - } - _ => (), - } + tokens.push(Token::CloseParen); + tokens.push(Token::Comparator(Comparator::Gt)); + 
tokens.push(Token::Number(0)); } - println!("comp_depth: {comp_depth:?} {logic_depth:?}"); - - match (comp_depth, logic_depth) { - (Some((comp_pos, comp_depth)), Some((logic_pos, logic_depth))) => { - match comp_depth.cmp(&logic_depth) { - std::cmp::Ordering::Less => true, - std::cmp::Ordering::Equal => comp_pos < logic_pos, - _ => false, - } - } - (Some(_), None) => true, - _ => false, - } + let expr = Parser::new(&tokens) + .parse() + .unwrap_result("parse expression"); + MetaExpression { tokens, expr } } } @@ -344,112 +48,192 @@ impl From for Token { } } -impl Display for MetaExpression { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str("if ")?; - - for token in &self.tokens { - for token in &token.prefix { - token.fmt(f)?; +impl Expr { + fn fmt_child( + &self, + f: &mut std::fmt::Formatter<'_>, + parent: Option<&BinaryOperator>, + in_comp: bool, + ) -> std::fmt::Result { + match self { + Expr::UnaryOp(op, expr) => { + let add_p = + in_comp && !matches!(expr.as_ref(), Expr::Literal(_) | Expr::Identifier(_)); + match op { + UnaryOperator::Not => f.write_str(if in_comp { "!" } else { "not " })?, + UnaryOperator::Minus => f.write_str("-")?, + } + if add_p { + f.write_str("(")?; + } + expr.fmt_child(f, None, in_comp)?; + if add_p { + f.write_str(")")?; + } + Ok(()) + } + Expr::BinaryOp(left, op, right) => match op { + BinaryOperator::Or | BinaryOperator::And => { + let add_p = parent.map_or(true, |pop| pop.precedence() != op.precedence()); + if add_p { + write!(f, "{op}(")?; + } + left.fmt_child(f, op.into(), in_comp)?; + f.write_str(", ")?; + right.fmt_child(f, op.into(), in_comp)?; + if add_p { + f.write_str(")") + } else { + Ok(()) + } + } + BinaryOperator::Greater + | BinaryOperator::Lesser + | BinaryOperator::GreaterOrEqual + | BinaryOperator::LesserOrEqual + | BinaryOperator::Equal => { + write!(f, "string :value {op} :comparator \"i;ascii-numeric\" \"")?; + let is_literal = matches!(left.as_ref(), Expr::Literal(_)); + if !is_literal { + f.write_str("${")?; + } + left.fmt_child(f, None, true)?; + if !is_literal { + f.write_str("}")?; + } + f.write_str("\" \"")?; + let is_literal = matches!(right.as_ref(), Expr::Literal(_)); + if !is_literal { + f.write_str("${")?; + } + right.fmt_child(f, None, true)?; + if !is_literal { + f.write_str("}")?; + } + f.write_str("\"") + } + BinaryOperator::Add + | BinaryOperator::Subtract + | BinaryOperator::Multiply + | BinaryOperator::Divide + | BinaryOperator::BitwiseAnd + | BinaryOperator::BitwiseOr => { + let add_p = parent.map_or(false, |pop| pop.precedence() != op.precedence()); + if add_p { + f.write_str("(")?; + } + left.fmt_child(f, op.into(), in_comp)?; + op.fmt(f)?; + right.fmt_child(f, op.into(), in_comp)?; + if add_p { + f.write_str(")")?; + } + Ok(()) + } + }, + Expr::Literal(v) => { + if !in_comp { + write!( + f, + "string :value \"gt\" :comparator \"i;ascii-numeric\" \"{v}\" \"0\"" + ) + } else { + v.fmt(f) + } + } + Expr::Identifier(i) => { + if !in_comp { + write!( + f, + "string :value \"gt\" :comparator \"i;ascii-numeric\" \"${{{i}}}\" \"0\"", + ) + } else { + i.fmt(f) + } } - - match &token.token { - Token::Logical(Logical::And) | Token::Logical(Logical::Or) => f.write_str(", "), - Token::Comparator(Comparator::Gt) - | Token::Comparator(Comparator::Lt) - | Token::Comparator(Comparator::Eq) - | Token::Comparator(Comparator::Ge) - | Token::Comparator(Comparator::Le) => f.write_str(" "), - _ => token.token.fmt(f), - }?; } - - Ok(()) } } -impl Display for Token { +impl Display for MetaExpression { + 
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("if ")?; + self.expr.fmt_child(f, None, false) + } +} + +impl Display for BinaryOperator { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Token::Tag(t) => t.fmt(f), - Token::Number(n) => n.fmt(f), - Token::Logical(Logical::And) => f.write_str("allof("), - Token::Logical(Logical::Or) => f.write_str("anyof("), - Token::Logical(Logical::Not) => f.write_str("not "), - Token::Comparator(comp) => { - f.write_str("string :value \"")?; - match comp { - Comparator::Eq => f.write_str("eq")?, - Comparator::Gt => f.write_str("gt")?, - Comparator::Lt => f.write_str("lt")?, - Comparator::Ge => f.write_str("ge")?, - Comparator::Le => f.write_str("gt")?, - _ => unreachable!(), - } - f.write_str("\" :comparator \"i;ascii-numeric\" ") - } - - Token::OpenParen => f.write_str("("), - Token::CloseParen => f.write_str(")"), - Token::Operation(Operation::Add) => f.write_str(" + "), - Token::Operation(Operation::Multiply) => f.write_str(" * "), - Token::Operation(Operation::Divide) => f.write_str(" / "), - Token::Operation(Operation::And) => f.write_str(" & "), - Token::Operation(Operation::Or) => f.write_str(" | "), - Token::Operation(Operation::Not) => f.write_str("!"), - Token::BeginExpression(is_static) => { - if *is_static { - f.write_str("\"") - } else { - f.write_str("\"${") - } - } - Token::EndExpression(is_static) => { - if *is_static { - f.write_str("\"") - } else { - f.write_str("}\"") - } - } + BinaryOperator::Or => f.write_str("anyof"), + BinaryOperator::And => f.write_str("allof"), + BinaryOperator::BitwiseOr => f.write_str(" | "), + BinaryOperator::BitwiseAnd => f.write_str(" & "), + BinaryOperator::Greater => f.write_str("\"gt\""), + BinaryOperator::Lesser => f.write_str("\"lt\""), + BinaryOperator::GreaterOrEqual => f.write_str("\"ge\""), + BinaryOperator::LesserOrEqual => f.write_str("\"le\""), + BinaryOperator::Equal => f.write_str("\"eq\""), + BinaryOperator::Add => f.write_str(" + "), + BinaryOperator::Subtract => f.write_str(" - "), + BinaryOperator::Multiply => f.write_str(" * "), + BinaryOperator::Divide => f.write_str(" / "), } } } #[cfg(test)] mod test { - use super::MetaExpression; + use crate::import::MetaExpression; #[test] fn parse_meta() { - for (expr, expected) in [ - /*( + for (pos, (expr, expected)) in [ + ( concat!( - "( ! HTML_IMAGE_ONLY_16 ) && ", - "( __LOWER_E > 20 ) && ", - "( __E_LIKE_LETTER > ( (__LOWER_E * 14 ) / 10) ) && ", - "( __E_LIKE_LETTER < ( 10 * __LOWER_E ) )" + "( ! 
A ) && ", + "( B > 20 ) && ", + "( C > ( (D * 14 ) / 10) ) && ", + "( E < ( 10 * F ) )" ), "", ), - ("(__DRUGS_ERECTILE1 || __DRUGS_ERECTILE2)", ""), - ("(__HELO_DYNAMIC_IPADDR && !HELO_STATIC_HOST)", ""), - ("__ML2 || __ML4", ""), - ("(__AT_HOTMAIL_MSGID && (!__FROM_HOTMAIL_COM && !__FROM_MSN_COM && !__FROM_YAHOO_COM))", ""), + ("(A || B)", ""), + ("(A && !B)", ""), + ("A || B", ""), + ("(A && (!B && !C && !D))", ""), ("(0)", ""), - ("RAZOR2_CHECK + DCC_CHECK + PYZOR_CHECK > 1", ""), - ("(SUBJECT_IN_BLOCKLIST)", ""), - ("__HAS_MSGID && !(__SANE_MSGID || __MSGID_COMMENT)", ""), - ("!__CTYPE_HTML && __X_MAILER_APPLEMAIL && (__MSGID_APPLEMAIL || __MIME_VERSION_APPLEMAIL)", ""), - ("((__AUTO_GEN_MS||__AUTO_GEN_3||__AUTO_GEN_4) && !__XM_VBULLETIN && !__X_CRON_ENV)", ""),*/ - ("(__WEBMAIL_ACCT + __MAILBOX_FULL + (__TVD_PH_SUBJ_META || __TVD_PH_BODY_META) > 3)", ""), - - ] { + ("A + B + C > 1", ""), + ("(A)", ""), + ("A && !(B || C)", ""), + ("!A && B && (C || D)", ""), + ("((A||B||C) && !D && !E)", ""), + ("(A + B + (C || D) > 3)", ""), + ( + "(A || B) > 2 && (C && D) == 0 || ((E+F-G) > 0 || (H||I) <= 4)", + "", + ), + ("(A || B) > (C && D) && E", ""), + //("", ""), + ] + .iter() + .enumerate() + { let meta = MetaExpression::from_meta(expr); //println!("{:#?}", meta.tokens); - let result = meta.to_string(); + /*if pos != 13 { + continue; + }*/ println!("{expr}"); - println!("{}", result); + //let tokens = Tokenizer::new(expr).collect::>(); + //println!("{tokens:?}"); + //let mut p = Parser::new(&tokens); + //let expr = p.parse().unwrap(); + + //println!("{:#?}", expr); + + println!("{}\n------------------------------------", meta); /*assert_eq!( result, diff --git a/crates/antispam/src/import/mod.rs b/crates/antispam/src/import/mod.rs index f71cdac0..a424ddca 100644 --- a/crates/antispam/src/import/mod.rs +++ b/crates/antispam/src/import/mod.rs @@ -1,9 +1,9 @@ use std::collections::HashMap; -use self::meta::MetaExpression; - +pub mod ast; pub mod meta; pub mod spamassassin; +pub mod tokenizer; pub mod utils; #[derive(Debug, Default, Clone)] @@ -49,6 +49,12 @@ enum RuleType { None, } +#[derive(Debug, Clone, Default)] +pub struct MetaExpression { + pub tokens: Vec, + pub expr: Expr, +} + impl RuleType { pub fn pattern(&mut self) -> Option<&mut String> { match self { @@ -116,10 +122,6 @@ pub enum Token { OpenParen, CloseParen, - - // Sieve specific - BeginExpression(bool), - EndExpression(bool), } #[derive(Debug, PartialEq, Eq, Clone)] @@ -143,11 +145,66 @@ pub enum Operation { Add, Multiply, Divide, + Subtract, And, Or, Not, } +#[derive(Debug, PartialEq, Clone)] +pub enum Expr { + UnaryOp(UnaryOperator, Box), + BinaryOp(Box, BinaryOperator, Box), + Literal(u32), + Identifier(String), +} + +impl Default for Expr { + fn default() -> Self { + Self::Literal(0) + } +} + +#[derive(Debug, PartialEq, Clone)] +pub enum UnaryOperator { + Not, + Minus, +} + +#[derive(Debug, PartialEq, Clone)] +pub enum BinaryOperator { + Or, + And, + Greater, + Lesser, + GreaterOrEqual, + LesserOrEqual, + Equal, + Add, + Subtract, + Multiply, + Divide, + BitwiseAnd, + BitwiseOr, +} + +impl BinaryOperator { + pub fn precedence(&self) -> u32 { + match self { + Self::Or => 1, + Self::And => 2, + Self::Greater + | Self::Lesser + | Self::GreaterOrEqual + | Self::LesserOrEqual + | Self::Equal => 3, + Self::Add | Self::Subtract => 4, + Self::Multiply | Self::Divide => 5, + Self::BitwiseAnd | Self::BitwiseOr => 6, + } + } +} + impl Rule { fn score(&self) -> f64 { self.scores.last().copied().unwrap_or_else(|| { diff --git 
a/crates/antispam/src/import/spamassassin.rs b/crates/antispam/src/import/spamassassin.rs index 706c75dd..edfdf47a 100644 --- a/crates/antispam/src/import/spamassassin.rs +++ b/crates/antispam/src/import/spamassassin.rs @@ -6,9 +6,9 @@ use std::{ }; use super::{ - meta::MetaExpression, utils::{fix_broken_regex, replace_tags}, - Header, HeaderMatches, HeaderPart, Rule, RuleType, TestFlag, Token, UnwrapResult, + Header, HeaderMatches, HeaderPart, MetaExpression, Rule, RuleType, TestFlag, Token, + UnwrapResult, }; const VERSION: f64 = 4.000000; @@ -1038,7 +1038,7 @@ pub fn import_spamassassin(path: PathBuf, extension: String, do_warn: bool, vali && !meta .tokens .iter() - .any(|t| matches!(&t.token, Token::Tag(n) if n == &rule.name))) + .any(|t| matches!(&t, Token::Tag(n) if n == &rule.name))) { continue; } diff --git a/crates/antispam/src/import/tokenizer.rs b/crates/antispam/src/import/tokenizer.rs new file mode 100644 index 00000000..a3c39859 --- /dev/null +++ b/crates/antispam/src/import/tokenizer.rs @@ -0,0 +1,181 @@ +use super::{Comparator, Logical, Operation, Token}; + +pub struct Tokenizer<'x> { + expr: &'x str, + iter: std::iter::Peekable>, + buf: String, + depth: u32, + comparator_depth: u32, + next_token: Option, +} + +impl<'x> Tokenizer<'x> { + pub fn new(expr: &'x str) -> Self { + Self { + expr, + iter: expr.chars().peekable(), + buf: String::new(), + depth: 0, + next_token: None, + comparator_depth: u32::MAX, + } + } +} + +impl<'x> Iterator for Tokenizer<'x> { + type Item = Token; + + fn next(&mut self) -> Option { + if let Some(token) = self.next_token.take() { + return Some(token); + } + + while let Some(ch) = self.iter.next() { + match ch { + 'A'..='Z' | 'a'..='z' | '0'..='9' | '_' => { + self.buf.push(ch); + } + _ => { + let mut depth = self.depth; + let prev_token = if !self.buf.is_empty() { + Token::from(std::mem::take(&mut self.buf)).into() + } else { + None + }; + let token = match ch { + '&' | '|' => { + if matches!(self.iter.next(), Some(c) if c == ch) { + let is_and = ch == '&'; + if self.depth > self.comparator_depth { + Token::Operation(if is_and { + Operation::And + } else { + Operation::Or + }) + } else { + let mut depth = self.depth; + let mut found_comp = false; + + for ch in self.iter.clone() { + match ch { + '(' => depth += 1, + ')' => { + depth -= 1; + } + '<' | '>' | '=' => { + found_comp = true; + break; + } + _ => (), + } + } + + if found_comp && depth < self.depth { + self.comparator_depth = depth; + Token::Operation(if is_and { + Operation::And + } else { + Operation::Or + }) + } else { + self.comparator_depth = u32::MAX; + Token::Logical(if is_and { + Logical::And + } else { + Logical::Or + }) + } + } + } else { + eprintln!("Warning: Single {ch} in meta expression {}", self.expr); + return None; + } + } + '!' 
=> Token::Logical(Logical::Not), + '=' => match self.iter.next() { + Some('=') => Token::Comparator(Comparator::Eq), + Some('>') => Token::Comparator(Comparator::Ge), + Some('<') => Token::Comparator(Comparator::Le), + _ => { + eprintln!("Warning: Single = in meta expression {}", self.expr); + Token::Comparator(Comparator::Eq) + } + }, + '>' => match self.iter.peek() { + Some('=') => { + self.iter.next(); + Token::Comparator(Comparator::Ge) + } + _ => Token::Comparator(Comparator::Gt), + }, + '<' => match self.iter.peek() { + Some('=') => { + self.iter.next(); + Token::Comparator(Comparator::Le) + } + _ => Token::Comparator(Comparator::Lt), + }, + '(' => { + self.depth += 1; + Token::OpenParen + } + ')' => { + if self.depth == 0 { + eprintln!( + "Warning: Unmatched close parenthesis in meta expression {}", + self.expr + ); + return None; + } + self.depth -= 1; + depth = self.depth; + + Token::CloseParen + } + '+' => Token::Operation(Operation::Add), + '*' => Token::Operation(Operation::Multiply), + '/' => Token::Operation(Operation::Divide), + '-' => Token::Operation(Operation::Subtract), + ' ' => { + if let Some(prev_token) = prev_token { + return Some(prev_token); + } else { + continue; + } + } + _ => { + eprintln!( + "Warning: Invalid character {ch} in meta expression {}", + self.expr + ); + return None; + } + }; + + if matches!(token, Token::Comparator(_)) { + self.comparator_depth = depth; + } + + return Some(if let Some(prev_token) = prev_token { + self.next_token = Some(token); + prev_token + } else { + token + }); + } + } + } + + if self.depth > 0 { + eprintln!( + "Warning: Unmatched open parenthesis in meta expression {}", + self.expr + ); + None + } else if !self.buf.is_empty() { + Some(Token::from(std::mem::take(&mut self.buf))) + } else { + None + } + } +}
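+
+#[cfg(test)]
+mod test {
+    use super::Tokenizer;
+    use crate::import::{ast::Parser, Logical, Token};
+
+    // Minimal sketch of how the tokenizer feeds the AST parser. The input
+    // expression is made up and the assertions only check the token stream
+    // and that parsing succeeds, not the exact tree shape.
+    #[test]
+    fn tokenize_and_parse() {
+        let expr = "(RULE_A + RULE_B > 2) && !RULE_C";
+
+        // "&&" following a parenthesized comparison stays a logical operator.
+        let tokens = Tokenizer::new(expr).collect::<Vec<Token>>();
+        assert!(tokens.contains(&Token::Logical(Logical::And)));
+
+        // The parser should accept the full token stream without leftovers.
+        let ast = Parser::new(&tokens).parse().expect("valid meta expression");
+        println!("{ast:#?}");
+    }
+}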