diff --git a/Cargo.lock b/Cargo.lock index f86c208d..be5ff2fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -93,9 +93,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.1" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -2676,15 +2676,6 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" -[[package]] -name = "linkify" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1dfa36d52c581e9ec783a7ce2a5e0143da6237be5811a0b3153fedfdbe9f780" -dependencies = [ - "memchr", -] - [[package]] name = "linux-raw-sys" version = "0.4.10" @@ -2994,11 +2985,16 @@ dependencies = [ "farmhash", "jieba-rs", "lazy_static", + "lru-cache", "nohash", + "parking_lot", + "phf", "rust-stemmers", "serde", "siphasher 1.0.0", "tinysegmenter", + "tokio", + "utils", "whatlang", "xxhash-rust", ] @@ -3294,9 +3290,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.9.1" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" dependencies = [ "num-traits", ] @@ -3630,9 +3626,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] @@ -3799,9 +3795,9 @@ dependencies = [ [[package]] name = "rasn" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2" +checksum = "c22b7f7ff0508dae62e1be69fe02f32eb88523090b50ac850637947853cf5b6d" dependencies = [ "arrayvec", "bitvec", @@ -3821,9 +3817,9 @@ dependencies = [ [[package]] name = "rasn-cms" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5" +checksum = "6ecf9f1bb38cbb2a032014f0329d7fd9c2b08f26c4fc882ad642bb95dfefd74f" dependencies = [ "rasn", "rasn-pkix", @@ -3831,9 +3827,9 @@ dependencies = [ [[package]] name = "rasn-derive" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618" +checksum = "a1e6ddbc9ada563036d59c322cb0886a9b08b346904eebbcd20af2e01caecee7" dependencies = [ "either", "itertools 0.10.5", @@ -3846,9 +3842,9 @@ dependencies = [ [[package]] name = "rasn-pkix" -version = "0.10.1" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8" +checksum = "b894c903130c4915d79d8d9ce155429b3896b25efa5f81de4d9ab7b1b0f0b7cf" dependencies = [ "rasn", ] @@ -3904,14 +3900,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.6" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" +checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.9", - "regex-syntax 0.7.5", + "regex-automata 0.4.1", + "regex-syntax 0.8.0", ] [[package]] @@ -3925,13 +3921,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.9" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.5", + "regex-syntax 0.8.0", ] [[package]] @@ -3942,9 +3938,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.7.5" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = "c3cbb081b9784b07cceb8824c8583f86db4814d172ab043f3c23f7dc600bf83d" [[package]] name = "reqwest" @@ -4610,7 +4606,7 @@ checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" [[package]] name = "sieve-rs" version = "0.3.1" -source = "git+https://github.com/stalwartlabs/sieve#c9288b62815610872e9f278b904e34d46124acb5" +source = "git+https://github.com/stalwartlabs/sieve#bbb265765ebe92394e429001e90ba2e9b4201f9a" dependencies = [ "ahash 0.8.3", "bincode", @@ -4690,13 +4686,13 @@ dependencies = [ "imagesize", "infer", "lazy_static", - "linkify", "lru-cache", "mail-auth", "mail-builder", "mail-parser", "mail-send", "md5", + "nlp", "num_cpus", "parking_lot", "rand 0.8.5", diff --git a/crates/directory/src/cache/lookup.rs b/crates/directory/src/cache/lookup.rs index 2574d774..b4a682a8 100644 --- a/crates/directory/src/cache/lookup.rs +++ b/crates/directory/src/cache/lookup.rs @@ -23,7 +23,7 @@ use mail_send::Credentials; -use crate::{Directory, Principal, QueryColumn}; +use crate::{DatabaseColumn, Directory, Principal}; use super::CachedDirectory; @@ -71,11 +71,15 @@ impl Directory for CachedDirectory { self.inner.expn(address).await } - async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result { + async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result { self.inner.lookup(query, params).await } - async fn query(&self, query: &str, params: &[&str]) -> crate::Result> { + async fn query( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> crate::Result>> { self.inner.query(query, params).await } diff --git a/crates/directory/src/imap/lookup.rs b/crates/directory/src/imap/lookup.rs index f526581a..21f2981d 100644 --- a/crates/directory/src/imap/lookup.rs +++ b/crates/directory/src/imap/lookup.rs @@ -24,7 +24,7 @@ use mail_send::Credentials; use smtp_proto::{AUTH_CRAM_MD5, AUTH_LOGIN, AUTH_OAUTHBEARER, AUTH_PLAIN, AUTH_XOAUTH2}; -use crate::{Directory, DirectoryError, Principal, QueryColumn}; +use crate::{DatabaseColumn, Directory, DirectoryError, Principal}; use super::{ImapDirectory, ImapError}; @@ -98,11 +98,15 @@ impl Directory for ImapDirectory { Err(DirectoryError::unsupported("imap", "expn")) } - async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result { + async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result { Err(DirectoryError::unsupported("imap", "lookup")) } - async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result> { + async fn query( + &self, + _: &str, + _: &[DatabaseColumn<'_>], + ) -> crate::Result>> { Err(DirectoryError::unsupported("imap", "query")) } diff --git a/crates/directory/src/ldap/lookup.rs b/crates/directory/src/ldap/lookup.rs index b3e5274f..db7c8112 100644 --- a/crates/directory/src/ldap/lookup.rs +++ b/crates/directory/src/ldap/lookup.rs @@ -24,7 +24,7 @@ use ldap3::{ResultEntry, Scope, SearchEntry}; use mail_send::Credentials; -use crate::{Directory, Principal, QueryColumn, Type}; +use crate::{DatabaseColumn, Directory, Principal, Type}; use super::{LdapDirectory, LdapMappings}; @@ -239,13 +239,17 @@ impl Directory for LdapDirectory { Ok(emails) } - async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result { + async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result { self.query_(query, params) .await .map(|entry| entry.is_some()) } - async fn query(&self, query: &str, params: &[&str]) -> crate::Result> { + async fn query( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> crate::Result>> { self.query_(query, params).await.map(|entry| { if let Some(entry) = entry { let mut object = String::new(); @@ -257,7 +261,7 @@ impl Directory for LdapDirectory { object.push('\n'); } } - vec![QueryColumn::Text(object)] + vec![DatabaseColumn::Text(object.into())] } else { vec![] } @@ -283,7 +287,11 @@ impl Directory for LdapDirectory { } impl LdapDirectory { - async fn query_(&self, query: &str, params: &[&str]) -> crate::Result> { + async fn query_( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> crate::Result> { let mut conn = self.pool.get().await?; tracing::trace!(context = "directory", event = "query", query = query, params = ?params); @@ -292,7 +300,7 @@ impl LdapDirectory { for (pos, item) in query.split('?').enumerate() { if pos > 0 { if let Some(param) = params.get(pos - 1) { - expanded_query.push_str(param); + expanded_query.push_str(param.as_str()); } } expanded_query.push_str(item); diff --git a/crates/directory/src/lib.rs b/crates/directory/src/lib.rs index 96853f16..191fe9ba 100644 --- a/crates/directory/src/lib.rs +++ b/crates/directory/src/lib.rs @@ -21,7 +21,11 @@ * for more details. */ -use std::{borrow::Cow, fmt::Debug, sync::Arc}; +use std::{ + borrow::Cow, + fmt::{Debug, Display}, + sync::Arc, +}; use ahash::{AHashMap, AHashSet}; use bb8::RunError; @@ -82,8 +86,12 @@ pub trait Directory: Sync + Send { async fn rcpt(&self, address: &str) -> crate::Result; async fn vrfy(&self, address: &str) -> Result>; async fn expn(&self, address: &str) -> Result>; - async fn lookup(&self, query: &str, params: &[&str]) -> Result; - async fn query(&self, query: &str, params: &[&str]) -> Result>; + async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> Result; + async fn query( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> Result>>; fn type_name(&self) -> &'static str { std::any::type_name::() @@ -91,12 +99,12 @@ pub trait Directory: Sync + Send { } #[derive(Clone, Debug)] -pub enum QueryColumn { +pub enum DatabaseColumn<'x> { Integer(i64), Bool(bool), Float(f64), - Text(String), - Blob(Vec), + Text(Cow<'x, str>), + Blob(Cow<'x, [u8]>), Null, } @@ -169,24 +177,24 @@ impl PartialEq for MatchType { impl Eq for MatchType {} impl Lookup { - pub async fn contains(&self, item: &str) -> Option { + pub async fn contains(&self, item: impl Into>) -> Option { match self { Lookup::Directory { directory, query } => { - match directory.lookup(query, &[item]).await { + match directory.lookup(query, &[item.into()]).await { Ok(result) => result.into(), Err(_) => None, } } - Lookup::List { list } => list.contains(item).into(), - Lookup::Map { map } => map.contains_key(item).into(), + Lookup::List { list } => list.contains(item.into().as_str()).into(), + Lookup::Map { map } => map.contains_key(item.into().as_str()).into(), } } - pub async fn lookup(&self, item: &str) -> Option> { + pub async fn lookup(&self, items: &[DatabaseColumn<'_>]) -> Option> { match self { - Lookup::Directory { directory, query } => match directory.query(query, &[item]).await { + Lookup::Directory { directory, query } => match directory.query(query, items).await { Ok(mut result) => match result.len() { - 1 if !matches!(result.first(), Some(QueryColumn::Null)) => { + 1 if !matches!(result.first(), Some(DatabaseColumn::Null)) => { result.pop().map(Variable::from).unwrap() } 0 => Variable::default(), @@ -195,21 +203,34 @@ impl Lookup { .into(), Err(_) => None, }, - Lookup::List { list } => Some(list.contains(item).into()), - Lookup::Map { map } => map.get(item).cloned(), + Lookup::List { list } => Some(list.contains(items[0].as_str()).into()), + Lookup::Map { map } => map.get(items[0].as_str()).cloned(), + } + } + + pub async fn query( + &self, + items: &[DatabaseColumn<'_>], + ) -> Option>> { + match self { + Lookup::Directory { directory, query } => match directory.query(query, items).await { + Ok(result) => Some(result), + Err(_) => None, + }, + _ => None, } } } -impl From for Variable<'static> { - fn from(value: QueryColumn) -> Self { +impl<'x> From> for Variable<'static> { + fn from(value: DatabaseColumn) -> Self { match value { - QueryColumn::Integer(v) => Variable::Integer(v), - QueryColumn::Bool(v) => Variable::Integer(i64::from(v)), - QueryColumn::Float(v) => Variable::Float(v), - QueryColumn::Text(v) => Variable::String(v), - QueryColumn::Blob(v) => Variable::String(v.into_string()), - QueryColumn::Null => Variable::StringRef(""), + DatabaseColumn::Integer(v) => Variable::Integer(v), + DatabaseColumn::Bool(v) => Variable::Integer(i64::from(v)), + DatabaseColumn::Float(v) => Variable::Float(v), + DatabaseColumn::Text(v) => Variable::String(v.into_owned()), + DatabaseColumn::Blob(v) => Variable::String(v.into_owned().into_string()), + DatabaseColumn::Null => Variable::StringRef(""), } } } @@ -457,3 +478,115 @@ impl AddressMapping { } } } + +impl<'x> DatabaseColumn<'x> { + pub fn as_str(&self) -> &str { + match self { + Self::Text(v) => v.as_ref(), + _ => "", + } + } +} + +impl<'x> From<&'x str> for DatabaseColumn<'x> { + fn from(value: &'x str) -> Self { + Self::Text(value.into()) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: String) -> Self { + Self::Text(value.into()) + } +} + +impl<'x> From<&'x String> for DatabaseColumn<'x> { + fn from(value: &'x String) -> Self { + Self::Text(value.into()) + } +} + +impl<'x> From> for DatabaseColumn<'x> { + fn from(value: Cow<'x, str>) -> Self { + Self::Text(value) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: bool) -> Self { + Self::Bool(value) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: i64) -> Self { + Self::Integer(value) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: u64) -> Self { + Self::Integer(value as i64) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: u32) -> Self { + Self::Integer(value as i64) + } +} + +impl<'x> From for DatabaseColumn<'x> { + fn from(value: f64) -> Self { + Self::Float(value) + } +} + +impl<'x> From<&'x [u8]> for DatabaseColumn<'x> { + fn from(value: &'x [u8]) -> Self { + Self::Blob(value.into()) + } +} + +impl<'x> From> for DatabaseColumn<'x> { + fn from(value: Vec) -> Self { + Self::Blob(value.into()) + } +} + +impl<'x> From> for DatabaseColumn<'x> { + fn from(value: Variable<'x>) -> Self { + match value { + Variable::String(v) => Self::Text(v.into()), + Variable::StringRef(v) => Self::Text(v.into()), + Variable::Integer(v) => Self::Integer(v), + Variable::Float(v) => Self::Float(v), + v => Self::Text(v.into_string().into()), + } + } +} + +impl<'x> From<&'x Variable<'x>> for DatabaseColumn<'x> { + fn from(value: &'x Variable<'x>) -> Self { + match value { + Variable::String(v) => Self::Text(v.into()), + Variable::StringRef(v) => Self::Text((*v).into()), + Variable::Integer(v) => Self::Integer(*v), + Variable::Float(v) => Self::Float(*v), + v => Self::Text(v.to_string().into()), + } + } +} + +impl<'x> Display for DatabaseColumn<'x> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DatabaseColumn::Text(v) => f.write_str(v.as_ref()), + DatabaseColumn::Integer(v) => write!(f, "{}", v), + DatabaseColumn::Bool(v) => write!(f, "{}", v), + DatabaseColumn::Float(v) => write!(f, "{}", v), + DatabaseColumn::Blob(v) => write!(f, "{}", String::from_utf8_lossy(v.as_ref())), + DatabaseColumn::Null => write!(f, "NULL"), + } + } +} diff --git a/crates/directory/src/memory/lookup.rs b/crates/directory/src/memory/lookup.rs index 950bddf8..0cc0aa41 100644 --- a/crates/directory/src/memory/lookup.rs +++ b/crates/directory/src/memory/lookup.rs @@ -23,7 +23,7 @@ use mail_send::Credentials; -use crate::{Directory, DirectoryError, Principal, QueryColumn}; +use crate::{DatabaseColumn, Directory, DirectoryError, Principal}; use super::{EmailType, MemoryDirectory}; @@ -132,11 +132,15 @@ impl Directory for MemoryDirectory { Ok(result) } - async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result { + async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result { Err(DirectoryError::unsupported("memory", "lookp")) } - async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result> { + async fn query( + &self, + _: &str, + _: &[DatabaseColumn<'_>], + ) -> crate::Result>> { Err(DirectoryError::unsupported("memory", "query")) } diff --git a/crates/directory/src/smtp/lookup.rs b/crates/directory/src/smtp/lookup.rs index 90eec63c..a0ef2fda 100644 --- a/crates/directory/src/smtp/lookup.rs +++ b/crates/directory/src/smtp/lookup.rs @@ -24,7 +24,7 @@ use mail_send::{smtp::AssertReply, Credentials}; use smtp_proto::Severity; -use crate::{Directory, DirectoryError, Principal, QueryColumn}; +use crate::{DatabaseColumn, Directory, DirectoryError, Principal}; use super::{SmtpClient, SmtpDirectory}; @@ -93,11 +93,15 @@ impl Directory for SmtpDirectory { .await } - async fn lookup(&self, _query: &str, _params: &[&str]) -> crate::Result { + async fn lookup(&self, _: &str, _: &[DatabaseColumn<'_>]) -> crate::Result { Err(DirectoryError::unsupported("smtp", "lookup")) } - async fn query(&self, _query: &str, _params: &[&str]) -> crate::Result> { + async fn query( + &self, + _: &str, + _: &[DatabaseColumn<'_>], + ) -> crate::Result>> { Err(DirectoryError::unsupported("smtp", "query")) } diff --git a/crates/directory/src/sql/lookup.rs b/crates/directory/src/sql/lookup.rs index 0aaa50be..866c11cd 100644 --- a/crates/directory/src/sql/lookup.rs +++ b/crates/directory/src/sql/lookup.rs @@ -25,7 +25,7 @@ use futures::TryStreamExt; use mail_send::Credentials; use sqlx::{any::AnyRow, postgres::any::AnyTypeInfoKind, Column, Row}; -use crate::{Directory, Principal, QueryColumn, Type}; +use crate::{DatabaseColumn, Directory, Principal, Type}; use super::{SqlDirectory, SqlMappings}; @@ -154,35 +154,39 @@ impl Directory for SqlDirectory { .map_err(Into::into) } - async fn lookup(&self, query: &str, params: &[&str]) -> crate::Result { + async fn lookup(&self, query: &str, params: &[DatabaseColumn<'_>]) -> crate::Result { self.query_(query, params).await.map(|row| row.is_some()) } - async fn query(&self, query: &str, params: &[&str]) -> crate::Result> { + async fn query( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> crate::Result>> { self.query_(query, params).await.map(|row| { if let Some(row) = row { let mut columns = Vec::with_capacity(row.columns().len()); for col in row.columns() { let idx = col.ordinal(); columns.push(match col.type_info().kind() { - AnyTypeInfoKind::Null => QueryColumn::Null, + AnyTypeInfoKind::Null => DatabaseColumn::Null, AnyTypeInfoKind::Bool => { - QueryColumn::Bool(row.try_get(idx).unwrap_or_default()) + DatabaseColumn::Bool(row.try_get(idx).unwrap_or_default()) } AnyTypeInfoKind::SmallInt | AnyTypeInfoKind::Integer | AnyTypeInfoKind::BigInt => { - QueryColumn::Integer(row.try_get(idx).unwrap_or_default()) + DatabaseColumn::Integer(row.try_get(idx).unwrap_or_default()) } AnyTypeInfoKind::Real | AnyTypeInfoKind::Double => { - QueryColumn::Float(row.try_get(idx).unwrap_or_default()) - } - AnyTypeInfoKind::Text => { - QueryColumn::Text(row.try_get(idx).unwrap_or_default()) - } - AnyTypeInfoKind::Blob => { - QueryColumn::Blob(row.try_get(idx).unwrap_or_default()) + DatabaseColumn::Float(row.try_get(idx).unwrap_or_default()) } + AnyTypeInfoKind::Text => DatabaseColumn::Text( + row.try_get::(idx).unwrap_or_default().into(), + ), + AnyTypeInfoKind::Blob => DatabaseColumn::Blob( + row.try_get::, _>(idx).unwrap_or_default().into(), + ), }); } columns @@ -204,11 +208,24 @@ impl Directory for SqlDirectory { } impl SqlDirectory { - async fn query_(&self, query: &str, params: &[&str]) -> crate::Result> { + async fn query_( + &self, + query: &str, + params: &[DatabaseColumn<'_>], + ) -> crate::Result> { tracing::trace!(context = "directory", event = "query", query = query, params = ?params); let mut q = sqlx::query(query); for param in params { - q = q.bind(param); + q = match param { + DatabaseColumn::Text(v) => q.bind(v.as_ref()), + DatabaseColumn::Integer(v) => q.bind(v), + DatabaseColumn::Bool(v) => q.bind(v), + DatabaseColumn::Float(v) => q.bind(v), + DatabaseColumn::Blob(v) => { + q.bind(std::str::from_utf8(v.as_ref()).unwrap_or_default()) + } + DatabaseColumn::Null => q.bind(""), + } } q.fetch(&self.pool).try_next().await.map_err(Into::into) diff --git a/crates/jmap/Cargo.toml b/crates/jmap/Cargo.toml index fa1de123..dc98cffe 100644 --- a/crates/jmap/Cargo.toml +++ b/crates/jmap/Cargo.toml @@ -37,8 +37,8 @@ p256 = { version = "0.13", features = ["ecdh"] } hkdf = "0.12.3" sha2 = "0.10.1" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-webpki-roots"]} -tokio-tungstenite = "0.20.0" -tungstenite = "0.20.0" +tokio-tungstenite = "0.20" +tungstenite = "0.20" chrono = "0.4" dashmap = "5.4" aes = "0.8.3" diff --git a/crates/nlp/Cargo.toml b/crates/nlp/Cargo.toml index 9db50841..0304da94 100644 --- a/crates/nlp/Cargo.toml +++ b/crates/nlp/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" resolver = "2" [dependencies] +utils = { path = "../utils" } xxhash-rust = { version = "0.8.5", features = ["xxh3"] } farmhash = "1.1.5" siphasher = "1.0" @@ -17,3 +18,12 @@ whatlang = "0.16" # Language detection rust-stemmers = "1.2" # Stemmers tinysegmenter = "0.1" # Japanese tokenizer jieba-rs = "0.6" # Chinese stemmer +phf = { version = "0.11", features = ["macros"] } +lru-cache = "0.1.2" +parking_lot = "0.12.1" + +[features] +test_mode = [] + +[dev-dependencies] +tokio = { version = "1.23", features = ["full"] } diff --git a/crates/nlp/src/bayes/bloom.rs b/crates/nlp/src/bayes/bloom.rs deleted file mode 100644 index e701bcd6..00000000 --- a/crates/nlp/src/bayes/bloom.rs +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use nohash::IsEnabled; - -use crate::transformers::osb::{Gram, OsbToken}; - -use super::TokenHash; - -pub struct BloomHasher<'x, T: Iterator>>> { - buf: Vec, - tokens: T, -} - -impl<'x, T: Iterator>>> BloomHasher<'x, T> { - pub fn new(tokens: T) -> Self { - Self { - buf: Vec::with_capacity(64), - tokens, - } - } -} - -impl<'x, T: Iterator>>> Iterator for BloomHasher<'x, T> { - type Item = OsbToken; - - fn next(&mut self) -> Option { - self.tokens.next().map(|token| { - let bytes = match token.inner { - Gram::Uni { t1 } => t1.as_bytes(), - Gram::Bi { t1, t2, .. } => { - self.buf.clear(); - self.buf.extend_from_slice(t1.as_bytes()); - self.buf.push(b' '); - self.buf.extend_from_slice(t2.as_bytes()); - &self.buf - } - }; - - OsbToken { - inner: TokenHash { - h1: xxhash_rust::xxh3::xxh3_64(bytes), - h2: farmhash::hash64(bytes), - }, - idx: token.idx, - } - }) - } -} - -impl std::hash::Hash for TokenHash { - fn hash(&self, state: &mut H) { - state.write_u64(self.h1 ^ self.h2); - } -} - -impl IsEnabled for TokenHash {} diff --git a/crates/nlp/src/bayes/cache.rs b/crates/nlp/src/bayes/cache.rs new file mode 100644 index 00000000..6645ec85 --- /dev/null +++ b/crates/nlp/src/bayes/cache.rs @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 Stalwart Labs Ltd. + * + * This file is part of the Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use std::{ + hash::BuildHasherDefault, + time::{Duration, Instant}, +}; + +use lru_cache::LruCache; +use nohash::NoHashHasher; +use parking_lot::Mutex; + +use super::{TokenHash, Weights}; + +#[derive(Debug)] +pub struct BayesTokenCache { + positive: Mutex>>>, + negative: Mutex>>>, + ttl_negative: Duration, + ttl_positive: Duration, +} + +#[derive(Debug, Clone)] +pub struct CacheItem { + item: Weights, + valid_until: Instant, +} + +impl BayesTokenCache { + pub fn new(capacity: usize, ttl_positive: Duration, ttl_negative: Duration) -> Self { + Self { + positive: Mutex::new(LruCache::with_hasher(capacity, Default::default())), + negative: Mutex::new(LruCache::with_hasher(capacity, Default::default())), + ttl_negative, + ttl_positive, + } + } + + pub fn get(&self, hash: &TokenHash) -> Option> { + { + let mut pos_cache = self.positive.lock(); + if let Some(entry) = pos_cache.get_mut(hash) { + return if entry.valid_until >= Instant::now() { + Some(Some(entry.item)) + } else { + pos_cache.remove(hash); + None + }; + } + } + { + let mut neg_cache = self.negative.lock(); + if let Some(entry) = neg_cache.get_mut(hash) { + return if *entry >= Instant::now() { + Some(None) + } else { + neg_cache.remove(hash); + None + }; + } + } + + None + } + + pub fn insert_positive(&self, hash: TokenHash, weights: Weights) { + self.positive.lock().insert( + hash, + CacheItem { + item: weights, + valid_until: Instant::now() + self.ttl_positive, + }, + ); + } + + pub fn insert_negative(&self, hash: TokenHash) { + self.negative + .lock() + .insert(hash, Instant::now() + self.ttl_negative); + } + + pub fn invalidate(&self, hash: &TokenHash) { + if self.positive.lock().remove(hash).is_none() { + self.negative.lock().remove(hash); + } + } +} diff --git a/crates/nlp/src/bayes/classify.rs b/crates/nlp/src/bayes/classify.rs index 38f5da85..a2b36b2a 100644 --- a/crates/nlp/src/bayes/classify.rs +++ b/crates/nlp/src/bayes/classify.rs @@ -21,13 +21,14 @@ * for more details. */ -use crate::transformers::osb::OsbToken; +use crate::tokenizers::osb::OsbToken; use super::{BayesClassifier, Weights}; // Position 0 represents Unigram weights const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0]; +// Credits: ported from RSpamd impl BayesClassifier { pub fn classify(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option where diff --git a/crates/nlp/src/bayes/mod.rs b/crates/nlp/src/bayes/mod.rs index 3fb419d2..99a38bcf 100644 --- a/crates/nlp/src/bayes/mod.rs +++ b/crates/nlp/src/bayes/mod.rs @@ -26,8 +26,11 @@ use std::{collections::HashMap, hash::BuildHasherDefault}; use nohash::NoHashHasher; use serde::{Deserialize, Serialize}; -pub mod bloom; +use crate::tokenizers::osb::Gram; + +pub mod cache; pub mod classify; +pub mod tokenize; pub mod train; #[derive(Debug, Serialize, Deserialize, Default)] @@ -37,7 +40,7 @@ pub struct BayesModel { pub ham_learns: u32, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BayesClassifier { pub min_token_hits: u32, pub min_tokens: u32, @@ -47,14 +50,14 @@ pub struct BayesClassifier { #[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)] pub struct TokenHash { - h1: u64, - h2: u64, + pub h1: u64, + pub h2: u64, } #[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)] pub struct Weights { - spam: u32, - ham: u32, + pub spam: u32, + pub ham: u32, } impl BayesClassifier { @@ -73,3 +76,32 @@ impl Default for BayesClassifier { Self::new() } } + +impl From> for TokenHash { + fn from(value: Gram<'_>) -> Self { + match value { + Gram::Uni { t1 } => TokenHash { + h1: xxhash_rust::xxh3::xxh3_64(t1.as_bytes()), + h2: farmhash::hash64(t1.as_bytes()), + }, + Gram::Bi { t1, t2, .. } => { + let mut buf = Vec::with_capacity(t1.len() + t2.len() + 1); + buf.extend_from_slice(t1.as_bytes()); + buf.push(b' '); + buf.extend_from_slice(t2.as_bytes()); + TokenHash { + h1: xxhash_rust::xxh3::xxh3_64(&buf), + h2: farmhash::hash64(&buf), + } + } + } + } +} + +impl std::hash::Hash for TokenHash { + fn hash(&self, state: &mut H) { + state.write_u64(self.h1 ^ self.h2); + } +} + +impl nohash::IsEnabled for TokenHash {} diff --git a/crates/nlp/src/bayes/tokenize.rs b/crates/nlp/src/bayes/tokenize.rs new file mode 100644 index 00000000..ebc44444 --- /dev/null +++ b/crates/nlp/src/bayes/tokenize.rs @@ -0,0 +1,1227 @@ +use std::borrow::Cow; + +use utils::suffixlist::PublicSuffix; + +use crate::{ + language::{ + detect::{LanguageDetector, MIN_LANGUAGE_SCORE}, + stemmer::STEMMER_MAP, + stopwords::STOP_WORDS, + Language, + }, + tokenizers::{ + chinese::JIEBA, + types::{TokenType, TypesTokenizer}, + }, +}; + +pub struct BayesTokenizer<'x, 'y> { + text: &'x str, + tokenizer: TypesTokenizer<'x, 'y>, + stemmer: Stemmer, + stop_words: Option<&'static phf::Set<&'static str>>, + tokens: Vec>, +} + +enum Stemmer { + IndoEuropean(rust_stemmers::Stemmer), + Mandarin, + Japanese, + None, +} + +impl<'x, 'y> BayesTokenizer<'x, 'y> { + pub fn new(text: &'x str, suffixes: &'y PublicSuffix) -> Self { + // Detect language + let (mut language, score) = + LanguageDetector::detect_single(text).unwrap_or((Language::English, 1.0)); + if score < MIN_LANGUAGE_SCORE { + language = Language::English; + } + + Self { + text, + tokenizer: TypesTokenizer::new(text, suffixes), + stemmer: match language { + Language::Mandarin => Stemmer::Mandarin, + Language::Japanese => Stemmer::Japanese, + _ => STEMMER_MAP[language as usize] + .map(|algo| Stemmer::IndoEuropean(rust_stemmers::Stemmer::create(algo))) + .unwrap_or(Stemmer::None), + }, + stop_words: STOP_WORDS[language as usize], + tokens: vec![], + } + } +} + +impl<'x, 'y> Iterator for BayesTokenizer<'x, 'y> { + type Item = Cow<'x, str>; + + fn next(&mut self) -> Option { + if let Some(prev_token) = self.tokens.pop() { + return Some(prev_token); + } + + loop { + let token = self.tokenizer.next()?; + + let word: Cow = match token.word { + TokenType::Alphabetic(word) | TokenType::Hexadecimal(word) => { + let word = word.to_lowercase(); + if self + .stop_words + .map_or(false, |sw| sw.contains(word.as_str())) + { + continue; + } + match &self.stemmer { + Stemmer::IndoEuropean(stemmer) => match stemmer.stem(&word) { + Cow::Borrowed(_) => word.into(), + Cow::Owned(stemmed_word) => stemmed_word.into(), + }, + Stemmer::Mandarin => { + let mut result = JIEBA.cut(&word, false).into_iter(); + if let Some(stemmed_word) = result.next() { + let stemmed_word = stemmed_word.to_string(); + self.tokens = result + .rev() + .map(|word| Cow::from(word.to_string())) + .collect::>(); + stemmed_word.into() + } else { + // This shouldn't happen, but just in case + continue; + } + } + Stemmer::Japanese => { + let mut result = tinysegmenter::tokenize(&word).into_iter(); + if let Some(stemmed_word) = result.next() { + self.tokens = result.rev().map(Cow::from).collect::>(); + stemmed_word.into() + } else { + // This shouldn't happen, but just in case + continue; + } + } + Stemmer::None => word.into(), + } + } + + TokenType::Url(word) => { + if let Some((_, host)) = word.split_once("://") { + host.split_once('/') + .map_or(host, |(h, _)| h) + .to_lowercase() + .into() + } else { + continue; + } + } + TokenType::UrlNoScheme(word) => word + .split_once('/') + .map_or(word, |(h, _)| h) + .to_lowercase() + .into(), + TokenType::Alphanumeric(word) + | TokenType::Email(word) + | TokenType::UrlNoHost(word) => word.to_lowercase().into(), + TokenType::Other(ch) => { + if SYMBOLS.contains(&ch) { + (&self.text[token.from..token.to]).into() + } else { + continue; + } + } + TokenType::Integer(word) | TokenType::Float(word) => word.into(), + TokenType::Punctuation(_) | TokenType::Space => { + continue; + } + }; + + return Some(word); + } + } +} + +pub static SYMBOLS: phf::Set = phf::phf_set! { + // Currency + '\u{0024}', '\u{00A2}', '\u{00A3}', '\u{00A4}', '\u{00A5}', '\u{058F}', '\u{060B}', '\u{07FE}', + '\u{07FF}', '\u{09F2}', '\u{09F3}', '\u{09FB}', '\u{0AF1}', '\u{0BF9}', '\u{0E3F}', '\u{17DB}', + '\u{20A0}', '\u{20A1}', '\u{20A2}', '\u{20A3}', '\u{20A4}', '\u{20A5}', '\u{20A6}', '\u{20A7}', + '\u{20A8}', '\u{20A9}', '\u{20AA}', '\u{20AB}', '\u{20AC}', '\u{20AD}', '\u{20AE}', '\u{20AF}', + '\u{20B0}', '\u{20B1}', '\u{20B2}', '\u{20B3}', '\u{20B4}', '\u{20B5}', '\u{20B6}', '\u{20B7}', + '\u{20B8}', '\u{20B9}', '\u{20BA}', '\u{20BB}', '\u{20BC}', '\u{20BD}', '\u{20BE}', '\u{20BF}', + '\u{20C0}', '\u{A838}', '\u{FDFC}', '\u{FE69}', '\u{FF04}', '\u{FFE0}', '\u{FFE1}', '\u{FFE5}', + '\u{FFE6}', '\u{11FDD}', '\u{11FDE}', '\u{11FDF}', '\u{11FE0}', '\u{1E2FF}', '\u{1ECB0}', + + // Math + '\u{002B}', '\u{003C}', '\u{003D}', '\u{003E}', '\u{007C}', '\u{007E}', '\u{00AC}', '\u{00B1}', + '\u{00D7}', '\u{00F7}', '\u{03F6}', '\u{0606}', '\u{0607}', '\u{0608}', '\u{2044}', '\u{2052}', + '\u{207A}', '\u{207B}', '\u{207C}', '\u{208A}', '\u{208B}', '\u{208C}', '\u{2118}', '\u{2140}', + '\u{2141}', '\u{2142}', '\u{2143}', '\u{2144}', '\u{214B}', '\u{2190}', '\u{2191}', '\u{2192}', + '\u{2193}', '\u{2194}', '\u{219A}', '\u{219B}', '\u{21A0}', '\u{21A3}', '\u{21A6}', '\u{21AE}', + '\u{21CE}', '\u{21CF}', '\u{21D2}', '\u{21D4}', '\u{21F4}', '\u{21F5}', '\u{21F6}', '\u{21F7}', + '\u{21F8}', '\u{21F9}', '\u{21FA}', '\u{21FB}', '\u{21FC}', '\u{21FD}', '\u{21FE}', '\u{21FF}', + '\u{2200}', '\u{2201}', '\u{2202}', '\u{2203}', '\u{2204}', '\u{2205}', '\u{2206}', '\u{2207}', + '\u{2208}', '\u{2209}', '\u{220A}', '\u{220B}', '\u{220C}', '\u{220D}', '\u{220E}', '\u{220F}', + '\u{2210}', '\u{2211}', '\u{2212}', '\u{2213}', '\u{2214}', '\u{2215}', '\u{2216}', '\u{2217}', + '\u{2218}', '\u{2219}', '\u{221A}', '\u{221B}', '\u{221C}', '\u{221D}', '\u{221E}', '\u{221F}', + '\u{2220}', '\u{2221}', '\u{2222}', '\u{2223}', '\u{2224}', '\u{2225}', '\u{2226}', '\u{2227}', + '\u{2228}', '\u{2229}', '\u{222A}', '\u{222B}', '\u{222C}', '\u{222D}', '\u{222E}', '\u{222F}', + '\u{2230}', '\u{2231}', '\u{2232}', '\u{2233}', '\u{2234}', '\u{2235}', '\u{2236}', '\u{2237}', + '\u{2238}', '\u{2239}', '\u{223A}', '\u{223B}', '\u{223C}', '\u{223D}', '\u{223E}', '\u{223F}', + '\u{2240}', '\u{2241}', '\u{2242}', '\u{2243}', '\u{2244}', '\u{2245}', '\u{2246}', '\u{2247}', + '\u{2248}', '\u{2249}', '\u{224A}', '\u{224B}', '\u{224C}', '\u{224D}', '\u{224E}', '\u{224F}', + '\u{2250}', '\u{2251}', '\u{2252}', '\u{2253}', '\u{2254}', '\u{2255}', '\u{2256}', '\u{2257}', + '\u{2258}', '\u{2259}', '\u{225A}', '\u{225B}', '\u{225C}', '\u{225D}', '\u{225E}', '\u{225F}', + '\u{2260}', '\u{2261}', '\u{2262}', '\u{2263}', '\u{2264}', '\u{2265}', '\u{2266}', '\u{2267}', + '\u{2268}', '\u{2269}', '\u{226A}', '\u{226B}', '\u{226C}', '\u{226D}', '\u{226E}', '\u{226F}', + '\u{2270}', '\u{2271}', '\u{2272}', '\u{2273}', '\u{2274}', '\u{2275}', '\u{2276}', '\u{2277}', + '\u{2278}', '\u{2279}', '\u{227A}', '\u{227B}', '\u{227C}', '\u{227D}', '\u{227E}', '\u{227F}', + '\u{2280}', '\u{2281}', '\u{2282}', '\u{2283}', '\u{2284}', '\u{2285}', '\u{2286}', '\u{2287}', + '\u{2288}', '\u{2289}', '\u{228A}', '\u{228B}', '\u{228C}', '\u{228D}', '\u{228E}', '\u{228F}', + '\u{2290}', '\u{2291}', '\u{2292}', '\u{2293}', '\u{2294}', '\u{2295}', '\u{2296}', '\u{2297}', + '\u{2298}', '\u{2299}', '\u{229A}', '\u{229B}', '\u{229C}', '\u{229D}', '\u{229E}', '\u{229F}', + '\u{22A0}', '\u{22A1}', '\u{22A2}', '\u{22A3}', '\u{22A4}', '\u{22A5}', '\u{22A6}', '\u{22A7}', + '\u{22A8}', '\u{22A9}', '\u{22AA}', '\u{22AB}', '\u{22AC}', '\u{22AD}', '\u{22AE}', '\u{22AF}', + '\u{22B0}', '\u{22B1}', '\u{22B2}', '\u{22B3}', '\u{22B4}', '\u{22B5}', '\u{22B6}', '\u{22B7}', + '\u{22B8}', '\u{22B9}', '\u{22BA}', '\u{22BB}', '\u{22BC}', '\u{22BD}', '\u{22BE}', '\u{22BF}', + '\u{22C0}', '\u{22C1}', '\u{22C2}', '\u{22C3}', '\u{22C4}', '\u{22C5}', '\u{22C6}', '\u{22C7}', + '\u{22C8}', '\u{22C9}', '\u{22CA}', '\u{22CB}', '\u{22CC}', '\u{22CD}', '\u{22CE}', '\u{22CF}', + '\u{22D0}', '\u{22D1}', '\u{22D2}', '\u{22D3}', '\u{22D4}', '\u{22D5}', '\u{22D6}', '\u{22D7}', + '\u{22D8}', '\u{22D9}', '\u{22DA}', '\u{22DB}', '\u{22DC}', '\u{22DD}', '\u{22DE}', '\u{22DF}', + '\u{22E0}', '\u{22E1}', '\u{22E2}', '\u{22E3}', '\u{22E4}', '\u{22E5}', '\u{22E6}', '\u{22E7}', + '\u{22E8}', '\u{22E9}', '\u{22EA}', '\u{22EB}', '\u{22EC}', '\u{22ED}', '\u{22EE}', '\u{22EF}', + '\u{22F0}', '\u{22F1}', '\u{22F2}', '\u{22F3}', '\u{22F4}', '\u{22F5}', '\u{22F6}', '\u{22F7}', + '\u{22F8}', '\u{22F9}', '\u{22FA}', '\u{22FB}', '\u{22FC}', '\u{22FD}', '\u{22FE}', '\u{22FF}', + '\u{2320}', '\u{2321}', '\u{237C}', '\u{239B}', '\u{239C}', '\u{239D}', '\u{239E}', '\u{239F}', + '\u{23A0}', '\u{23A1}', '\u{23A2}', '\u{23A3}', '\u{23A4}', '\u{23A5}', '\u{23A6}', '\u{23A7}', + '\u{23A8}', '\u{23A9}', '\u{23AA}', '\u{23AB}', '\u{23AC}', '\u{23AD}', '\u{23AE}', '\u{23AF}', + '\u{23B0}', '\u{23B1}', '\u{23B2}', '\u{23B3}', '\u{23DC}', '\u{23DD}', '\u{23DE}', '\u{23DF}', + '\u{23E0}', '\u{23E1}', '\u{25B7}', '\u{25C1}', '\u{25F8}', '\u{25F9}', '\u{25FA}', '\u{25FB}', + '\u{25FC}', '\u{25FD}', '\u{25FE}', '\u{25FF}', '\u{266F}', '\u{27C0}', '\u{27C1}', '\u{27C2}', + '\u{27C3}', '\u{27C4}', '\u{27C7}', '\u{27C8}', '\u{27C9}', '\u{27CA}', '\u{27CB}', '\u{27CC}', + '\u{27CD}', '\u{27CE}', '\u{27CF}', '\u{27D0}', '\u{27D1}', '\u{27D2}', '\u{27D3}', '\u{27D4}', + '\u{27D5}', '\u{27D6}', '\u{27D7}', '\u{27D8}', '\u{27D9}', '\u{27DA}', '\u{27DB}', '\u{27DC}', + '\u{27DD}', '\u{27DE}', '\u{27DF}', '\u{27E0}', '\u{27E1}', '\u{27E2}', '\u{27E3}', '\u{27E4}', + '\u{27E5}', '\u{27F0}', '\u{27F1}', '\u{27F2}', '\u{27F3}', '\u{27F4}', '\u{27F5}', '\u{27F6}', + '\u{27F7}', '\u{27F8}', '\u{27F9}', '\u{27FA}', '\u{27FB}', '\u{27FC}', '\u{27FD}', '\u{27FE}', + '\u{27FF}', '\u{2900}', '\u{2901}', '\u{2902}', '\u{2903}', '\u{2904}', '\u{2905}', '\u{2906}', + '\u{2907}', '\u{2908}', '\u{2909}', '\u{290A}', '\u{290B}', '\u{290C}', '\u{290D}', '\u{290E}', + '\u{290F}', '\u{2910}', '\u{2911}', '\u{2912}', '\u{2913}', '\u{2914}', '\u{2915}', '\u{2916}', + '\u{2917}', '\u{2918}', '\u{2919}', '\u{291A}', '\u{291B}', '\u{291C}', '\u{291D}', '\u{291E}', + '\u{291F}', '\u{2920}', '\u{2921}', '\u{2922}', '\u{2923}', '\u{2924}', '\u{2925}', '\u{2926}', + '\u{2927}', '\u{2928}', '\u{2929}', '\u{292A}', '\u{292B}', '\u{292C}', '\u{292D}', '\u{292E}', + '\u{292F}', '\u{2930}', '\u{2931}', '\u{2932}', '\u{2933}', '\u{2934}', '\u{2935}', '\u{2936}', + '\u{2937}', '\u{2938}', '\u{2939}', '\u{293A}', '\u{293B}', '\u{293C}', '\u{293D}', '\u{293E}', + '\u{293F}', '\u{2940}', '\u{2941}', '\u{2942}', '\u{2943}', '\u{2944}', '\u{2945}', '\u{2946}', + '\u{2947}', '\u{2948}', '\u{2949}', '\u{294A}', '\u{294B}', '\u{294C}', '\u{294D}', '\u{294E}', + '\u{294F}', '\u{2950}', '\u{2951}', '\u{2952}', '\u{2953}', '\u{2954}', '\u{2955}', '\u{2956}', + '\u{2957}', '\u{2958}', '\u{2959}', '\u{295A}', '\u{295B}', '\u{295C}', '\u{295D}', '\u{295E}', + '\u{295F}', '\u{2960}', '\u{2961}', '\u{2962}', '\u{2963}', '\u{2964}', '\u{2965}', '\u{2966}', + '\u{2967}', '\u{2968}', '\u{2969}', '\u{296A}', '\u{296B}', '\u{296C}', '\u{296D}', '\u{296E}', + '\u{296F}', '\u{2970}', '\u{2971}', '\u{2972}', '\u{2973}', '\u{2974}', '\u{2975}', '\u{2976}', + '\u{2977}', '\u{2978}', '\u{2979}', '\u{297A}', '\u{297B}', '\u{297C}', '\u{297D}', '\u{297E}', + '\u{297F}', '\u{2980}', '\u{2981}', '\u{2982}', '\u{2999}', '\u{299A}', '\u{299B}', '\u{299C}', + '\u{299D}', '\u{299E}', '\u{299F}', '\u{29A0}', '\u{29A1}', '\u{29A2}', '\u{29A3}', '\u{29A4}', + '\u{29A5}', '\u{29A6}', '\u{29A7}', '\u{29A8}', '\u{29A9}', '\u{29AA}', '\u{29AB}', '\u{29AC}', + '\u{29AD}', '\u{29AE}', '\u{29AF}', '\u{29B0}', '\u{29B1}', '\u{29B2}', '\u{29B3}', '\u{29B4}', + '\u{29B5}', '\u{29B6}', '\u{29B7}', '\u{29B8}', '\u{29B9}', '\u{29BA}', '\u{29BB}', '\u{29BC}', + '\u{29BD}', '\u{29BE}', '\u{29BF}', '\u{29C0}', '\u{29C1}', '\u{29C2}', '\u{29C3}', '\u{29C4}', + '\u{29C5}', '\u{29C6}', '\u{29C7}', '\u{29C8}', '\u{29C9}', '\u{29CA}', '\u{29CB}', '\u{29CC}', + '\u{29CD}', '\u{29CE}', '\u{29CF}', '\u{29D0}', '\u{29D1}', '\u{29D2}', '\u{29D3}', '\u{29D4}', + '\u{29D5}', '\u{29D6}', '\u{29D7}', '\u{29DC}', '\u{29DD}', '\u{29DE}', '\u{29DF}', '\u{29E0}', + '\u{29E1}', '\u{29E2}', '\u{29E3}', '\u{29E4}', '\u{29E5}', '\u{29E6}', '\u{29E7}', '\u{29E8}', + '\u{29E9}', '\u{29EA}', '\u{29EB}', '\u{29EC}', '\u{29ED}', '\u{29EE}', '\u{29EF}', '\u{29F0}', + '\u{29F1}', '\u{29F2}', '\u{29F3}', '\u{29F4}', '\u{29F5}', '\u{29F6}', '\u{29F7}', '\u{29F8}', + '\u{29F9}', '\u{29FA}', '\u{29FB}', '\u{29FE}', '\u{29FF}', '\u{2A00}', '\u{2A01}', '\u{2A02}', + '\u{2A03}', '\u{2A04}', '\u{2A05}', '\u{2A06}', '\u{2A07}', '\u{2A08}', '\u{2A09}', '\u{2A0A}', + '\u{2A0B}', '\u{2A0C}', '\u{2A0D}', '\u{2A0E}', '\u{2A0F}', '\u{2A10}', '\u{2A11}', '\u{2A12}', + '\u{2A13}', '\u{2A14}', '\u{2A15}', '\u{2A16}', '\u{2A17}', '\u{2A18}', '\u{2A19}', '\u{2A1A}', + '\u{2A1B}', '\u{2A1C}', '\u{2A1D}', '\u{2A1E}', '\u{2A1F}', '\u{2A20}', '\u{2A21}', '\u{2A22}', + '\u{2A23}', '\u{2A24}', '\u{2A25}', '\u{2A26}', '\u{2A27}', '\u{2A28}', '\u{2A29}', '\u{2A2A}', + '\u{2A2B}', '\u{2A2C}', '\u{2A2D}', '\u{2A2E}', '\u{2A2F}', '\u{2A30}', '\u{2A31}', '\u{2A32}', + '\u{2A33}', '\u{2A34}', '\u{2A35}', '\u{2A36}', '\u{2A37}', '\u{2A38}', '\u{2A39}', '\u{2A3A}', + '\u{2A3B}', '\u{2A3C}', '\u{2A3D}', '\u{2A3E}', '\u{2A3F}', '\u{2A40}', '\u{2A41}', '\u{2A42}', + '\u{2A43}', '\u{2A44}', '\u{2A45}', '\u{2A46}', '\u{2A47}', '\u{2A48}', '\u{2A49}', '\u{2A4A}', + '\u{2A4B}', '\u{2A4C}', '\u{2A4D}', '\u{2A4E}', '\u{2A4F}', '\u{2A50}', '\u{2A51}', '\u{2A52}', + '\u{2A53}', '\u{2A54}', '\u{2A55}', '\u{2A56}', '\u{2A57}', '\u{2A58}', '\u{2A59}', '\u{2A5A}', + '\u{2A5B}', '\u{2A5C}', '\u{2A5D}', '\u{2A5E}', '\u{2A5F}', '\u{2A60}', '\u{2A61}', '\u{2A62}', + '\u{2A63}', '\u{2A64}', '\u{2A65}', '\u{2A66}', '\u{2A67}', '\u{2A68}', '\u{2A69}', '\u{2A6A}', + '\u{2A6B}', '\u{2A6C}', '\u{2A6D}', '\u{2A6E}', '\u{2A6F}', '\u{2A70}', '\u{2A71}', '\u{2A72}', + '\u{2A73}', '\u{2A74}', '\u{2A75}', '\u{2A76}', '\u{2A77}', '\u{2A78}', '\u{2A79}', '\u{2A7A}', + '\u{2A7B}', '\u{2A7C}', '\u{2A7D}', '\u{2A7E}', '\u{2A7F}', '\u{2A80}', '\u{2A81}', '\u{2A82}', + '\u{2A83}', '\u{2A84}', '\u{2A85}', '\u{2A86}', '\u{2A87}', '\u{2A88}', '\u{2A89}', '\u{2A8A}', + '\u{2A8B}', '\u{2A8C}', '\u{2A8D}', '\u{2A8E}', '\u{2A8F}', '\u{2A90}', '\u{2A91}', '\u{2A92}', + '\u{2A93}', '\u{2A94}', '\u{2A95}', '\u{2A96}', '\u{2A97}', '\u{2A98}', '\u{2A99}', '\u{2A9A}', + '\u{2A9B}', '\u{2A9C}', '\u{2A9D}', '\u{2A9E}', '\u{2A9F}', '\u{2AA0}', '\u{2AA1}', '\u{2AA2}', + '\u{2AA3}', '\u{2AA4}', '\u{2AA5}', '\u{2AA6}', '\u{2AA7}', '\u{2AA8}', '\u{2AA9}', '\u{2AAA}', + '\u{2AAB}', '\u{2AAC}', '\u{2AAD}', '\u{2AAE}', '\u{2AAF}', '\u{2AB0}', '\u{2AB1}', '\u{2AB2}', + '\u{2AB3}', '\u{2AB4}', '\u{2AB5}', '\u{2AB6}', '\u{2AB7}', '\u{2AB8}', '\u{2AB9}', '\u{2ABA}', + '\u{2ABB}', '\u{2ABC}', '\u{2ABD}', '\u{2ABE}', '\u{2ABF}', '\u{2AC0}', '\u{2AC1}', '\u{2AC2}', + '\u{2AC3}', '\u{2AC4}', '\u{2AC5}', '\u{2AC6}', '\u{2AC7}', '\u{2AC8}', '\u{2AC9}', '\u{2ACA}', + '\u{2ACB}', '\u{2ACC}', '\u{2ACD}', '\u{2ACE}', '\u{2ACF}', '\u{2AD0}', '\u{2AD1}', '\u{2AD2}', + '\u{2AD3}', '\u{2AD4}', '\u{2AD5}', '\u{2AD6}', '\u{2AD7}', '\u{2AD8}', '\u{2AD9}', '\u{2ADA}', + '\u{2ADB}', '\u{2ADC}', '\u{2ADD}', '\u{2ADE}', '\u{2ADF}', '\u{2AE0}', '\u{2AE1}', '\u{2AE2}', + '\u{2AE3}', '\u{2AE4}', '\u{2AE5}', '\u{2AE6}', '\u{2AE7}', '\u{2AE8}', '\u{2AE9}', '\u{2AEA}', + '\u{2AEB}', '\u{2AEC}', '\u{2AED}', '\u{2AEE}', '\u{2AEF}', '\u{2AF0}', '\u{2AF1}', '\u{2AF2}', + '\u{2AF3}', '\u{2AF4}', '\u{2AF5}', '\u{2AF6}', '\u{2AF7}', '\u{2AF8}', '\u{2AF9}', '\u{2AFA}', + '\u{2AFB}', '\u{2AFC}', '\u{2AFD}', '\u{2AFE}', '\u{2AFF}', '\u{2B30}', '\u{2B31}', '\u{2B32}', + '\u{2B33}', '\u{2B34}', '\u{2B35}', '\u{2B36}', '\u{2B37}', '\u{2B38}', '\u{2B39}', '\u{2B3A}', + '\u{2B3B}', '\u{2B3C}', '\u{2B3D}', '\u{2B3E}', '\u{2B3F}', '\u{2B40}', '\u{2B41}', '\u{2B42}', + '\u{2B43}', '\u{2B44}', '\u{2B47}', '\u{2B48}', '\u{2B49}', '\u{2B4A}', '\u{2B4B}', '\u{2B4C}', + '\u{FB29}', '\u{FE62}', '\u{FE64}', '\u{FE65}', '\u{FE66}', '\u{FF0B}', '\u{FF1C}', '\u{FF1D}', + '\u{FF1E}', '\u{FF5C}', '\u{FF5E}', '\u{FFE2}', '\u{FFE9}', '\u{FFEA}', '\u{FFEB}', '\u{FFEC}', + '\u{1D6C1}', '\u{1D6DB}', '\u{1D6FB}', '\u{1D715}', '\u{1D735}', '\u{1D74F}', '\u{1D76F}', '\u{1D789}', + '\u{1D7A9}', '\u{1D7C3}', '\u{1EEF0}', '\u{1EEF1}', + + // Modifier + '\u{005E}', '\u{0060}', '\u{00A8}', '\u{00AF}', '\u{00B4}', '\u{00B8}', '\u{02C2}', '\u{02C3}', + '\u{02C4}', '\u{02C5}', '\u{02D2}', '\u{02D3}', '\u{02D4}', '\u{02D5}', '\u{02D6}', '\u{02D7}', + '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{02DB}', '\u{02DC}', '\u{02DD}', '\u{02DE}', '\u{02DF}', + '\u{02E5}', '\u{02E6}', '\u{02E7}', '\u{02E8}', '\u{02E9}', '\u{02EA}', '\u{02EB}', '\u{02ED}', + '\u{02EF}', '\u{02F0}', '\u{02F1}', '\u{02F2}', '\u{02F3}', '\u{02F4}', '\u{02F5}', '\u{02F6}', + '\u{02F7}', '\u{02F8}', '\u{02F9}', '\u{02FA}', '\u{02FB}', '\u{02FC}', '\u{02FD}', '\u{02FE}', + '\u{02FF}', '\u{0375}', '\u{0384}', '\u{0385}', '\u{0888}', '\u{1FBD}', '\u{1FBF}', '\u{1FC0}', + '\u{1FC1}', '\u{1FCD}', '\u{1FCE}', '\u{1FCF}', '\u{1FDD}', '\u{1FDE}', '\u{1FDF}', '\u{1FED}', + '\u{1FEE}', '\u{1FEF}', '\u{1FFD}', '\u{1FFE}', '\u{309B}', '\u{309C}', '\u{A700}', '\u{A701}', + '\u{A702}', '\u{A703}', '\u{A704}', '\u{A705}', '\u{A706}', '\u{A707}', '\u{A708}', '\u{A709}', + '\u{A70A}', '\u{A70B}', '\u{A70C}', '\u{A70D}', '\u{A70E}', '\u{A70F}', '\u{A710}', '\u{A711}', + '\u{A712}', '\u{A713}', '\u{A714}', '\u{A715}', '\u{A716}', '\u{A720}', '\u{A721}', '\u{A789}', + '\u{A78A}', '\u{AB5B}', '\u{AB6A}', '\u{AB6B}', '\u{FBB2}', '\u{FBB3}', '\u{FBB4}', '\u{FBB5}', + '\u{FBB6}', '\u{FBB7}', '\u{FBB8}', '\u{FBB9}', '\u{FBBA}', '\u{FBBB}', '\u{FBBC}', '\u{FBBD}', + '\u{FBBE}', '\u{FBBF}', '\u{FBC0}', '\u{FBC1}', '\u{FBC2}', '\u{FF3E}', '\u{FF40}', '\u{FFE3}', + '\u{1F3FB}', '\u{1F3FC}', '\u{1F3FD}', '\u{1F3FE}', '\u{1F3FF}', + + // Other symbols + '\u{00A6}', '\u{00A9}', '\u{00AE}', '\u{00B0}', '\u{0482}', '\u{058D}', '\u{058E}', '\u{060E}', + '\u{060F}', '\u{06DE}', '\u{06E9}', '\u{06FD}', '\u{06FE}', '\u{07F6}', '\u{09FA}', '\u{0B70}', + '\u{0BF3}', '\u{0BF4}', '\u{0BF5}', '\u{0BF6}', '\u{0BF7}', '\u{0BF8}', '\u{0BFA}', '\u{0C7F}', + '\u{0D4F}', '\u{0D79}', '\u{0F01}', '\u{0F02}', '\u{0F03}', '\u{0F13}', '\u{0F15}', '\u{0F16}', + '\u{0F17}', '\u{0F1A}', '\u{0F1B}', '\u{0F1C}', '\u{0F1D}', '\u{0F1E}', '\u{0F1F}', '\u{0F34}', + '\u{0F36}', '\u{0F38}', '\u{0FBE}', '\u{0FBF}', '\u{0FC0}', '\u{0FC1}', '\u{0FC2}', '\u{0FC3}', + '\u{0FC4}', '\u{0FC5}', '\u{0FC7}', '\u{0FC8}', '\u{0FC9}', '\u{0FCA}', '\u{0FCB}', '\u{0FCC}', + '\u{0FCE}', '\u{0FCF}', '\u{0FD5}', '\u{0FD6}', '\u{0FD7}', '\u{0FD8}', '\u{109E}', '\u{109F}', + '\u{1390}', '\u{1391}', '\u{1392}', '\u{1393}', '\u{1394}', '\u{1395}', '\u{1396}', '\u{1397}', + '\u{1398}', '\u{1399}', '\u{166D}', '\u{1940}', '\u{19DE}', '\u{19DF}', '\u{19E0}', '\u{19E1}', + '\u{19E2}', '\u{19E3}', '\u{19E4}', '\u{19E5}', '\u{19E6}', '\u{19E7}', '\u{19E8}', '\u{19E9}', + '\u{19EA}', '\u{19EB}', '\u{19EC}', '\u{19ED}', '\u{19EE}', '\u{19EF}', '\u{19F0}', '\u{19F1}', + '\u{19F2}', '\u{19F3}', '\u{19F4}', '\u{19F5}', '\u{19F6}', '\u{19F7}', '\u{19F8}', '\u{19F9}', + '\u{19FA}', '\u{19FB}', '\u{19FC}', '\u{19FD}', '\u{19FE}', '\u{19FF}', '\u{1B61}', '\u{1B62}', + '\u{1B63}', '\u{1B64}', '\u{1B65}', '\u{1B66}', '\u{1B67}', '\u{1B68}', '\u{1B69}', '\u{1B6A}', + '\u{1B74}', '\u{1B75}', '\u{1B76}', '\u{1B77}', '\u{1B78}', '\u{1B79}', '\u{1B7A}', '\u{1B7B}', + '\u{1B7C}', '\u{2100}', '\u{2101}', '\u{2103}', '\u{2104}', '\u{2105}', '\u{2106}', '\u{2108}', + '\u{2109}', '\u{2114}', '\u{2116}', '\u{2117}', '\u{211E}', '\u{211F}', '\u{2120}', '\u{2121}', + '\u{2122}', '\u{2123}', '\u{2125}', '\u{2127}', '\u{2129}', '\u{212E}', '\u{213A}', '\u{213B}', + '\u{214A}', '\u{214C}', '\u{214D}', '\u{214F}', '\u{218A}', '\u{218B}', '\u{2195}', '\u{2196}', + '\u{2197}', '\u{2198}', '\u{2199}', '\u{219C}', '\u{219D}', '\u{219E}', '\u{219F}', '\u{21A1}', + '\u{21A2}', '\u{21A4}', '\u{21A5}', '\u{21A7}', '\u{21A8}', '\u{21A9}', '\u{21AA}', '\u{21AB}', + '\u{21AC}', '\u{21AD}', '\u{21AF}', '\u{21B0}', '\u{21B1}', '\u{21B2}', '\u{21B3}', '\u{21B4}', + '\u{21B5}', '\u{21B6}', '\u{21B7}', '\u{21B8}', '\u{21B9}', '\u{21BA}', '\u{21BB}', '\u{21BC}', + '\u{21BD}', '\u{21BE}', '\u{21BF}', '\u{21C0}', '\u{21C1}', '\u{21C2}', '\u{21C3}', '\u{21C4}', + '\u{21C5}', '\u{21C6}', '\u{21C7}', '\u{21C8}', '\u{21C9}', '\u{21CA}', '\u{21CB}', '\u{21CC}', + '\u{21CD}', '\u{21D0}', '\u{21D1}', '\u{21D3}', '\u{21D5}', '\u{21D6}', '\u{21D7}', '\u{21D8}', + '\u{21D9}', '\u{21DA}', '\u{21DB}', '\u{21DC}', '\u{21DD}', '\u{21DE}', '\u{21DF}', '\u{21E0}', + '\u{21E1}', '\u{21E2}', '\u{21E3}', '\u{21E4}', '\u{21E5}', '\u{21E6}', '\u{21E7}', '\u{21E8}', + '\u{21E9}', '\u{21EA}', '\u{21EB}', '\u{21EC}', '\u{21ED}', '\u{21EE}', '\u{21EF}', '\u{21F0}', + '\u{21F1}', '\u{21F2}', '\u{21F3}', '\u{2300}', '\u{2301}', '\u{2302}', '\u{2303}', '\u{2304}', + '\u{2305}', '\u{2306}', '\u{2307}', '\u{230C}', '\u{230D}', '\u{230E}', '\u{230F}', '\u{2310}', + '\u{2311}', '\u{2312}', '\u{2313}', '\u{2314}', '\u{2315}', '\u{2316}', '\u{2317}', '\u{2318}', + '\u{2319}', '\u{231A}', '\u{231B}', '\u{231C}', '\u{231D}', '\u{231E}', '\u{231F}', '\u{2322}', + '\u{2323}', '\u{2324}', '\u{2325}', '\u{2326}', '\u{2327}', '\u{2328}', '\u{232B}', '\u{232C}', + '\u{232D}', '\u{232E}', '\u{232F}', '\u{2330}', '\u{2331}', '\u{2332}', '\u{2333}', '\u{2334}', + '\u{2335}', '\u{2336}', '\u{2337}', '\u{2338}', '\u{2339}', '\u{233A}', '\u{233B}', '\u{233C}', + '\u{233D}', '\u{233E}', '\u{233F}', '\u{2340}', '\u{2341}', '\u{2342}', '\u{2343}', '\u{2344}', + '\u{2345}', '\u{2346}', '\u{2347}', '\u{2348}', '\u{2349}', '\u{234A}', '\u{234B}', '\u{234C}', + '\u{234D}', '\u{234E}', '\u{234F}', '\u{2350}', '\u{2351}', '\u{2352}', '\u{2353}', '\u{2354}', + '\u{2355}', '\u{2356}', '\u{2357}', '\u{2358}', '\u{2359}', '\u{235A}', '\u{235B}', '\u{235C}', + '\u{235D}', '\u{235E}', '\u{235F}', '\u{2360}', '\u{2361}', '\u{2362}', '\u{2363}', '\u{2364}', + '\u{2365}', '\u{2366}', '\u{2367}', '\u{2368}', '\u{2369}', '\u{236A}', '\u{236B}', '\u{236C}', + '\u{236D}', '\u{236E}', '\u{236F}', '\u{2370}', '\u{2371}', '\u{2372}', '\u{2373}', '\u{2374}', + '\u{2375}', '\u{2376}', '\u{2377}', '\u{2378}', '\u{2379}', '\u{237A}', '\u{237B}', '\u{237D}', + '\u{237E}', '\u{237F}', '\u{2380}', '\u{2381}', '\u{2382}', '\u{2383}', '\u{2384}', '\u{2385}', + '\u{2386}', '\u{2387}', '\u{2388}', '\u{2389}', '\u{238A}', '\u{238B}', '\u{238C}', '\u{238D}', + '\u{238E}', '\u{238F}', '\u{2390}', '\u{2391}', '\u{2392}', '\u{2393}', '\u{2394}', '\u{2395}', + '\u{2396}', '\u{2397}', '\u{2398}', '\u{2399}', '\u{239A}', '\u{23B4}', '\u{23B5}', '\u{23B6}', + '\u{23B7}', '\u{23B8}', '\u{23B9}', '\u{23BA}', '\u{23BB}', '\u{23BC}', '\u{23BD}', '\u{23BE}', + '\u{23BF}', '\u{23C0}', '\u{23C1}', '\u{23C2}', '\u{23C3}', '\u{23C4}', '\u{23C5}', '\u{23C6}', + '\u{23C7}', '\u{23C8}', '\u{23C9}', '\u{23CA}', '\u{23CB}', '\u{23CC}', '\u{23CD}', '\u{23CE}', + '\u{23CF}', '\u{23D0}', '\u{23D1}', '\u{23D2}', '\u{23D3}', '\u{23D4}', '\u{23D5}', '\u{23D6}', + '\u{23D7}', '\u{23D8}', '\u{23D9}', '\u{23DA}', '\u{23DB}', '\u{23E2}', '\u{23E3}', '\u{23E4}', + '\u{23E5}', '\u{23E6}', '\u{23E7}', '\u{23E8}', '\u{23E9}', '\u{23EA}', '\u{23EB}', '\u{23EC}', + '\u{23ED}', '\u{23EE}', '\u{23EF}', '\u{23F0}', '\u{23F1}', '\u{23F2}', '\u{23F3}', '\u{23F4}', + '\u{23F5}', '\u{23F6}', '\u{23F7}', '\u{23F8}', '\u{23F9}', '\u{23FA}', '\u{23FB}', '\u{23FC}', + '\u{23FD}', '\u{23FE}', '\u{23FF}', '\u{2400}', '\u{2401}', '\u{2402}', '\u{2403}', '\u{2404}', + '\u{2405}', '\u{2406}', '\u{2407}', '\u{2408}', '\u{2409}', '\u{240A}', '\u{240B}', '\u{240C}', + '\u{240D}', '\u{240E}', '\u{240F}', '\u{2410}', '\u{2411}', '\u{2412}', '\u{2413}', '\u{2414}', + '\u{2415}', '\u{2416}', '\u{2417}', '\u{2418}', '\u{2419}', '\u{241A}', '\u{241B}', '\u{241C}', + '\u{241D}', '\u{241E}', '\u{241F}', '\u{2420}', '\u{2421}', '\u{2422}', '\u{2423}', '\u{2424}', + '\u{2425}', '\u{2426}', '\u{2440}', '\u{2441}', '\u{2442}', '\u{2443}', '\u{2444}', '\u{2445}', + '\u{2446}', '\u{2447}', '\u{2448}', '\u{2449}', '\u{244A}', '\u{249C}', '\u{249D}', '\u{249E}', + '\u{249F}', '\u{24A0}', '\u{24A1}', '\u{24A2}', '\u{24A3}', '\u{24A4}', '\u{24A5}', '\u{24A6}', + '\u{24A7}', '\u{24A8}', '\u{24A9}', '\u{24AA}', '\u{24AB}', '\u{24AC}', '\u{24AD}', '\u{24AE}', + '\u{24AF}', '\u{24B0}', '\u{24B1}', '\u{24B2}', '\u{24B3}', '\u{24B4}', '\u{24B5}', '\u{24B6}', + '\u{24B7}', '\u{24B8}', '\u{24B9}', '\u{24BA}', '\u{24BB}', '\u{24BC}', '\u{24BD}', '\u{24BE}', + '\u{24BF}', '\u{24C0}', '\u{24C1}', '\u{24C2}', '\u{24C3}', '\u{24C4}', '\u{24C5}', '\u{24C6}', + '\u{24C7}', '\u{24C8}', '\u{24C9}', '\u{24CA}', '\u{24CB}', '\u{24CC}', '\u{24CD}', '\u{24CE}', + '\u{24CF}', '\u{24D0}', '\u{24D1}', '\u{24D2}', '\u{24D3}', '\u{24D4}', '\u{24D5}', '\u{24D6}', + '\u{24D7}', '\u{24D8}', '\u{24D9}', '\u{24DA}', '\u{24DB}', '\u{24DC}', '\u{24DD}', '\u{24DE}', + '\u{24DF}', '\u{24E0}', '\u{24E1}', '\u{24E2}', '\u{24E3}', '\u{24E4}', '\u{24E5}', '\u{24E6}', + '\u{24E7}', '\u{24E8}', '\u{24E9}', '\u{2500}', '\u{2501}', '\u{2502}', '\u{2503}', '\u{2504}', + '\u{2505}', '\u{2506}', '\u{2507}', '\u{2508}', '\u{2509}', '\u{250A}', '\u{250B}', '\u{250C}', + '\u{250D}', '\u{250E}', '\u{250F}', '\u{2510}', '\u{2511}', '\u{2512}', '\u{2513}', '\u{2514}', + '\u{2515}', '\u{2516}', '\u{2517}', '\u{2518}', '\u{2519}', '\u{251A}', '\u{251B}', '\u{251C}', + '\u{251D}', '\u{251E}', '\u{251F}', '\u{2520}', '\u{2521}', '\u{2522}', '\u{2523}', '\u{2524}', + '\u{2525}', '\u{2526}', '\u{2527}', '\u{2528}', '\u{2529}', '\u{252A}', '\u{252B}', '\u{252C}', + '\u{252D}', '\u{252E}', '\u{252F}', '\u{2530}', '\u{2531}', '\u{2532}', '\u{2533}', '\u{2534}', + '\u{2535}', '\u{2536}', '\u{2537}', '\u{2538}', '\u{2539}', '\u{253A}', '\u{253B}', '\u{253C}', + '\u{253D}', '\u{253E}', '\u{253F}', '\u{2540}', '\u{2541}', '\u{2542}', '\u{2543}', '\u{2544}', + '\u{2545}', '\u{2546}', '\u{2547}', '\u{2548}', '\u{2549}', '\u{254A}', '\u{254B}', '\u{254C}', + '\u{254D}', '\u{254E}', '\u{254F}', '\u{2550}', '\u{2551}', '\u{2552}', '\u{2553}', '\u{2554}', + '\u{2555}', '\u{2556}', '\u{2557}', '\u{2558}', '\u{2559}', '\u{255A}', '\u{255B}', '\u{255C}', + '\u{255D}', '\u{255E}', '\u{255F}', '\u{2560}', '\u{2561}', '\u{2562}', '\u{2563}', '\u{2564}', + '\u{2565}', '\u{2566}', '\u{2567}', '\u{2568}', '\u{2569}', '\u{256A}', '\u{256B}', '\u{256C}', + '\u{256D}', '\u{256E}', '\u{256F}', '\u{2570}', '\u{2571}', '\u{2572}', '\u{2573}', '\u{2574}', + '\u{2575}', '\u{2576}', '\u{2577}', '\u{2578}', '\u{2579}', '\u{257A}', '\u{257B}', '\u{257C}', + '\u{257D}', '\u{257E}', '\u{257F}', '\u{2580}', '\u{2581}', '\u{2582}', '\u{2583}', '\u{2584}', + '\u{2585}', '\u{2586}', '\u{2587}', '\u{2588}', '\u{2589}', '\u{258A}', '\u{258B}', '\u{258C}', + '\u{258D}', '\u{258E}', '\u{258F}', '\u{2590}', '\u{2591}', '\u{2592}', '\u{2593}', '\u{2594}', + '\u{2595}', '\u{2596}', '\u{2597}', '\u{2598}', '\u{2599}', '\u{259A}', '\u{259B}', '\u{259C}', + '\u{259D}', '\u{259E}', '\u{259F}', '\u{25A0}', '\u{25A1}', '\u{25A2}', '\u{25A3}', '\u{25A4}', + '\u{25A5}', '\u{25A6}', '\u{25A7}', '\u{25A8}', '\u{25A9}', '\u{25AA}', '\u{25AB}', '\u{25AC}', + '\u{25AD}', '\u{25AE}', '\u{25AF}', '\u{25B0}', '\u{25B1}', '\u{25B2}', '\u{25B3}', '\u{25B4}', + '\u{25B5}', '\u{25B6}', '\u{25B8}', '\u{25B9}', '\u{25BA}', '\u{25BB}', '\u{25BC}', '\u{25BD}', + '\u{25BE}', '\u{25BF}', '\u{25C0}', '\u{25C2}', '\u{25C3}', '\u{25C4}', '\u{25C5}', '\u{25C6}', + '\u{25C7}', '\u{25C8}', '\u{25C9}', '\u{25CA}', '\u{25CB}', '\u{25CC}', '\u{25CD}', '\u{25CE}', + '\u{25CF}', '\u{25D0}', '\u{25D1}', '\u{25D2}', '\u{25D3}', '\u{25D4}', '\u{25D5}', '\u{25D6}', + '\u{25D7}', '\u{25D8}', '\u{25D9}', '\u{25DA}', '\u{25DB}', '\u{25DC}', '\u{25DD}', '\u{25DE}', + '\u{25DF}', '\u{25E0}', '\u{25E1}', '\u{25E2}', '\u{25E3}', '\u{25E4}', '\u{25E5}', '\u{25E6}', + '\u{25E7}', '\u{25E8}', '\u{25E9}', '\u{25EA}', '\u{25EB}', '\u{25EC}', '\u{25ED}', '\u{25EE}', + '\u{25EF}', '\u{25F0}', '\u{25F1}', '\u{25F2}', '\u{25F3}', '\u{25F4}', '\u{25F5}', '\u{25F6}', + '\u{25F7}', '\u{2600}', '\u{2601}', '\u{2602}', '\u{2603}', '\u{2604}', '\u{2605}', '\u{2606}', + '\u{2607}', '\u{2608}', '\u{2609}', '\u{260A}', '\u{260B}', '\u{260C}', '\u{260D}', '\u{260E}', + '\u{260F}', '\u{2610}', '\u{2611}', '\u{2612}', '\u{2613}', '\u{2614}', '\u{2615}', '\u{2616}', + '\u{2617}', '\u{2618}', '\u{2619}', '\u{261A}', '\u{261B}', '\u{261C}', '\u{261D}', '\u{261E}', + '\u{261F}', '\u{2620}', '\u{2621}', '\u{2622}', '\u{2623}', '\u{2624}', '\u{2625}', '\u{2626}', + '\u{2627}', '\u{2628}', '\u{2629}', '\u{262A}', '\u{262B}', '\u{262C}', '\u{262D}', '\u{262E}', + '\u{262F}', '\u{2630}', '\u{2631}', '\u{2632}', '\u{2633}', '\u{2634}', '\u{2635}', '\u{2636}', + '\u{2637}', '\u{2638}', '\u{2639}', '\u{263A}', '\u{263B}', '\u{263C}', '\u{263D}', '\u{263E}', + '\u{263F}', '\u{2640}', '\u{2641}', '\u{2642}', '\u{2643}', '\u{2644}', '\u{2645}', '\u{2646}', + '\u{2647}', '\u{2648}', '\u{2649}', '\u{264A}', '\u{264B}', '\u{264C}', '\u{264D}', '\u{264E}', + '\u{264F}', '\u{2650}', '\u{2651}', '\u{2652}', '\u{2653}', '\u{2654}', '\u{2655}', '\u{2656}', + '\u{2657}', '\u{2658}', '\u{2659}', '\u{265A}', '\u{265B}', '\u{265C}', '\u{265D}', '\u{265E}', + '\u{265F}', '\u{2660}', '\u{2661}', '\u{2662}', '\u{2663}', '\u{2664}', '\u{2665}', '\u{2666}', + '\u{2667}', '\u{2668}', '\u{2669}', '\u{266A}', '\u{266B}', '\u{266C}', '\u{266D}', '\u{266E}', + '\u{2670}', '\u{2671}', '\u{2672}', '\u{2673}', '\u{2674}', '\u{2675}', '\u{2676}', '\u{2677}', + '\u{2678}', '\u{2679}', '\u{267A}', '\u{267B}', '\u{267C}', '\u{267D}', '\u{267E}', '\u{267F}', + '\u{2680}', '\u{2681}', '\u{2682}', '\u{2683}', '\u{2684}', '\u{2685}', '\u{2686}', '\u{2687}', + '\u{2688}', '\u{2689}', '\u{268A}', '\u{268B}', '\u{268C}', '\u{268D}', '\u{268E}', '\u{268F}', + '\u{2690}', '\u{2691}', '\u{2692}', '\u{2693}', '\u{2694}', '\u{2695}', '\u{2696}', '\u{2697}', + '\u{2698}', '\u{2699}', '\u{269A}', '\u{269B}', '\u{269C}', '\u{269D}', '\u{269E}', '\u{269F}', + '\u{26A0}', '\u{26A1}', '\u{26A2}', '\u{26A3}', '\u{26A4}', '\u{26A5}', '\u{26A6}', '\u{26A7}', + '\u{26A8}', '\u{26A9}', '\u{26AA}', '\u{26AB}', '\u{26AC}', '\u{26AD}', '\u{26AE}', '\u{26AF}', + '\u{26B0}', '\u{26B1}', '\u{26B2}', '\u{26B3}', '\u{26B4}', '\u{26B5}', '\u{26B6}', '\u{26B7}', + '\u{26B8}', '\u{26B9}', '\u{26BA}', '\u{26BB}', '\u{26BC}', '\u{26BD}', '\u{26BE}', '\u{26BF}', + '\u{26C0}', '\u{26C1}', '\u{26C2}', '\u{26C3}', '\u{26C4}', '\u{26C5}', '\u{26C6}', '\u{26C7}', + '\u{26C8}', '\u{26C9}', '\u{26CA}', '\u{26CB}', '\u{26CC}', '\u{26CD}', '\u{26CE}', '\u{26CF}', + '\u{26D0}', '\u{26D1}', '\u{26D2}', '\u{26D3}', '\u{26D4}', '\u{26D5}', '\u{26D6}', '\u{26D7}', + '\u{26D8}', '\u{26D9}', '\u{26DA}', '\u{26DB}', '\u{26DC}', '\u{26DD}', '\u{26DE}', '\u{26DF}', + '\u{26E0}', '\u{26E1}', '\u{26E2}', '\u{26E3}', '\u{26E4}', '\u{26E5}', '\u{26E6}', '\u{26E7}', + '\u{26E8}', '\u{26E9}', '\u{26EA}', '\u{26EB}', '\u{26EC}', '\u{26ED}', '\u{26EE}', '\u{26EF}', + '\u{26F0}', '\u{26F1}', '\u{26F2}', '\u{26F3}', '\u{26F4}', '\u{26F5}', '\u{26F6}', '\u{26F7}', + '\u{26F8}', '\u{26F9}', '\u{26FA}', '\u{26FB}', '\u{26FC}', '\u{26FD}', '\u{26FE}', '\u{26FF}', + '\u{2700}', '\u{2701}', '\u{2702}', '\u{2703}', '\u{2704}', '\u{2705}', '\u{2706}', '\u{2707}', + '\u{2708}', '\u{2709}', '\u{270A}', '\u{270B}', '\u{270C}', '\u{270D}', '\u{270E}', '\u{270F}', + '\u{2710}', '\u{2711}', '\u{2712}', '\u{2713}', '\u{2714}', '\u{2715}', '\u{2716}', '\u{2717}', + '\u{2718}', '\u{2719}', '\u{271A}', '\u{271B}', '\u{271C}', '\u{271D}', '\u{271E}', '\u{271F}', + '\u{2720}', '\u{2721}', '\u{2722}', '\u{2723}', '\u{2724}', '\u{2725}', '\u{2726}', '\u{2727}', + '\u{2728}', '\u{2729}', '\u{272A}', '\u{272B}', '\u{272C}', '\u{272D}', '\u{272E}', '\u{272F}', + '\u{2730}', '\u{2731}', '\u{2732}', '\u{2733}', '\u{2734}', '\u{2735}', '\u{2736}', '\u{2737}', + '\u{2738}', '\u{2739}', '\u{273A}', '\u{273B}', '\u{273C}', '\u{273D}', '\u{273E}', '\u{273F}', + '\u{2740}', '\u{2741}', '\u{2742}', '\u{2743}', '\u{2744}', '\u{2745}', '\u{2746}', '\u{2747}', + '\u{2748}', '\u{2749}', '\u{274A}', '\u{274B}', '\u{274C}', '\u{274D}', '\u{274E}', '\u{274F}', + '\u{2750}', '\u{2751}', '\u{2752}', '\u{2753}', '\u{2754}', '\u{2755}', '\u{2756}', '\u{2757}', + '\u{2758}', '\u{2759}', '\u{275A}', '\u{275B}', '\u{275C}', '\u{275D}', '\u{275E}', '\u{275F}', + '\u{2760}', '\u{2761}', '\u{2762}', '\u{2763}', '\u{2764}', '\u{2765}', '\u{2766}', '\u{2767}', + '\u{2794}', '\u{2795}', '\u{2796}', '\u{2797}', '\u{2798}', '\u{2799}', '\u{279A}', '\u{279B}', + '\u{279C}', '\u{279D}', '\u{279E}', '\u{279F}', '\u{27A0}', '\u{27A1}', '\u{27A2}', '\u{27A3}', + '\u{27A4}', '\u{27A5}', '\u{27A6}', '\u{27A7}', '\u{27A8}', '\u{27A9}', '\u{27AA}', '\u{27AB}', + '\u{27AC}', '\u{27AD}', '\u{27AE}', '\u{27AF}', '\u{27B0}', '\u{27B1}', '\u{27B2}', '\u{27B3}', + '\u{27B4}', '\u{27B5}', '\u{27B6}', '\u{27B7}', '\u{27B8}', '\u{27B9}', '\u{27BA}', '\u{27BB}', + '\u{27BC}', '\u{27BD}', '\u{27BE}', '\u{27BF}', '\u{2800}', '\u{2801}', '\u{2802}', '\u{2803}', + '\u{2804}', '\u{2805}', '\u{2806}', '\u{2807}', '\u{2808}', '\u{2809}', '\u{280A}', '\u{280B}', + '\u{280C}', '\u{280D}', '\u{280E}', '\u{280F}', '\u{2810}', '\u{2811}', '\u{2812}', '\u{2813}', + '\u{2814}', '\u{2815}', '\u{2816}', '\u{2817}', '\u{2818}', '\u{2819}', '\u{281A}', '\u{281B}', + '\u{281C}', '\u{281D}', '\u{281E}', '\u{281F}', '\u{2820}', '\u{2821}', '\u{2822}', '\u{2823}', + '\u{2824}', '\u{2825}', '\u{2826}', '\u{2827}', '\u{2828}', '\u{2829}', '\u{282A}', '\u{282B}', + '\u{282C}', '\u{282D}', '\u{282E}', '\u{282F}', '\u{2830}', '\u{2831}', '\u{2832}', '\u{2833}', + '\u{2834}', '\u{2835}', '\u{2836}', '\u{2837}', '\u{2838}', '\u{2839}', '\u{283A}', '\u{283B}', + '\u{283C}', '\u{283D}', '\u{283E}', '\u{283F}', '\u{2840}', '\u{2841}', '\u{2842}', '\u{2843}', + '\u{2844}', '\u{2845}', '\u{2846}', '\u{2847}', '\u{2848}', '\u{2849}', '\u{284A}', '\u{284B}', + '\u{284C}', '\u{284D}', '\u{284E}', '\u{284F}', '\u{2850}', '\u{2851}', '\u{2852}', '\u{2853}', + '\u{2854}', '\u{2855}', '\u{2856}', '\u{2857}', '\u{2858}', '\u{2859}', '\u{285A}', '\u{285B}', + '\u{285C}', '\u{285D}', '\u{285E}', '\u{285F}', '\u{2860}', '\u{2861}', '\u{2862}', '\u{2863}', + '\u{2864}', '\u{2865}', '\u{2866}', '\u{2867}', '\u{2868}', '\u{2869}', '\u{286A}', '\u{286B}', + '\u{286C}', '\u{286D}', '\u{286E}', '\u{286F}', '\u{2870}', '\u{2871}', '\u{2872}', '\u{2873}', + '\u{2874}', '\u{2875}', '\u{2876}', '\u{2877}', '\u{2878}', '\u{2879}', '\u{287A}', '\u{287B}', + '\u{287C}', '\u{287D}', '\u{287E}', '\u{287F}', '\u{2880}', '\u{2881}', '\u{2882}', '\u{2883}', + '\u{2884}', '\u{2885}', '\u{2886}', '\u{2887}', '\u{2888}', '\u{2889}', '\u{288A}', '\u{288B}', + '\u{288C}', '\u{288D}', '\u{288E}', '\u{288F}', '\u{2890}', '\u{2891}', '\u{2892}', '\u{2893}', + '\u{2894}', '\u{2895}', '\u{2896}', '\u{2897}', '\u{2898}', '\u{2899}', '\u{289A}', '\u{289B}', + '\u{289C}', '\u{289D}', '\u{289E}', '\u{289F}', '\u{28A0}', '\u{28A1}', '\u{28A2}', '\u{28A3}', + '\u{28A4}', '\u{28A5}', '\u{28A6}', '\u{28A7}', '\u{28A8}', '\u{28A9}', '\u{28AA}', '\u{28AB}', + '\u{28AC}', '\u{28AD}', '\u{28AE}', '\u{28AF}', '\u{28B0}', '\u{28B1}', '\u{28B2}', '\u{28B3}', + '\u{28B4}', '\u{28B5}', '\u{28B6}', '\u{28B7}', '\u{28B8}', '\u{28B9}', '\u{28BA}', '\u{28BB}', + '\u{28BC}', '\u{28BD}', '\u{28BE}', '\u{28BF}', '\u{28C0}', '\u{28C1}', '\u{28C2}', '\u{28C3}', + '\u{28C4}', '\u{28C5}', '\u{28C6}', '\u{28C7}', '\u{28C8}', '\u{28C9}', '\u{28CA}', '\u{28CB}', + '\u{28CC}', '\u{28CD}', '\u{28CE}', '\u{28CF}', '\u{28D0}', '\u{28D1}', '\u{28D2}', '\u{28D3}', + '\u{28D4}', '\u{28D5}', '\u{28D6}', '\u{28D7}', '\u{28D8}', '\u{28D9}', '\u{28DA}', '\u{28DB}', + '\u{28DC}', '\u{28DD}', '\u{28DE}', '\u{28DF}', '\u{28E0}', '\u{28E1}', '\u{28E2}', '\u{28E3}', + '\u{28E4}', '\u{28E5}', '\u{28E6}', '\u{28E7}', '\u{28E8}', '\u{28E9}', '\u{28EA}', '\u{28EB}', + '\u{28EC}', '\u{28ED}', '\u{28EE}', '\u{28EF}', '\u{28F0}', '\u{28F1}', '\u{28F2}', '\u{28F3}', + '\u{28F4}', '\u{28F5}', '\u{28F6}', '\u{28F7}', '\u{28F8}', '\u{28F9}', '\u{28FA}', '\u{28FB}', + '\u{28FC}', '\u{28FD}', '\u{28FE}', '\u{28FF}', '\u{2B00}', '\u{2B01}', '\u{2B02}', '\u{2B03}', + '\u{2B04}', '\u{2B05}', '\u{2B06}', '\u{2B07}', '\u{2B08}', '\u{2B09}', '\u{2B0A}', '\u{2B0B}', + '\u{2B0C}', '\u{2B0D}', '\u{2B0E}', '\u{2B0F}', '\u{2B10}', '\u{2B11}', '\u{2B12}', '\u{2B13}', + '\u{2B14}', '\u{2B15}', '\u{2B16}', '\u{2B17}', '\u{2B18}', '\u{2B19}', '\u{2B1A}', '\u{2B1B}', + '\u{2B1C}', '\u{2B1D}', '\u{2B1E}', '\u{2B1F}', '\u{2B20}', '\u{2B21}', '\u{2B22}', '\u{2B23}', + '\u{2B24}', '\u{2B25}', '\u{2B26}', '\u{2B27}', '\u{2B28}', '\u{2B29}', '\u{2B2A}', '\u{2B2B}', + '\u{2B2C}', '\u{2B2D}', '\u{2B2E}', '\u{2B2F}', '\u{2B45}', '\u{2B46}', '\u{2B4D}', '\u{2B4E}', + '\u{2B4F}', '\u{2B50}', '\u{2B51}', '\u{2B52}', '\u{2B53}', '\u{2B54}', '\u{2B55}', '\u{2B56}', + '\u{2B57}', '\u{2B58}', '\u{2B59}', '\u{2B5A}', '\u{2B5B}', '\u{2B5C}', '\u{2B5D}', '\u{2B5E}', + '\u{2B5F}', '\u{2B60}', '\u{2B61}', '\u{2B62}', '\u{2B63}', '\u{2B64}', '\u{2B65}', '\u{2B66}', + '\u{2B67}', '\u{2B68}', '\u{2B69}', '\u{2B6A}', '\u{2B6B}', '\u{2B6C}', '\u{2B6D}', '\u{2B6E}', + '\u{2B6F}', '\u{2B70}', '\u{2B71}', '\u{2B72}', '\u{2B73}', '\u{2B76}', '\u{2B77}', '\u{2B78}', + '\u{2B79}', '\u{2B7A}', '\u{2B7B}', '\u{2B7C}', '\u{2B7D}', '\u{2B7E}', '\u{2B7F}', '\u{2B80}', + '\u{2B81}', '\u{2B82}', '\u{2B83}', '\u{2B84}', '\u{2B85}', '\u{2B86}', '\u{2B87}', '\u{2B88}', + '\u{2B89}', '\u{2B8A}', '\u{2B8B}', '\u{2B8C}', '\u{2B8D}', '\u{2B8E}', '\u{2B8F}', '\u{2B90}', + '\u{2B91}', '\u{2B92}', '\u{2B93}', '\u{2B94}', '\u{2B95}', '\u{2B97}', '\u{2B98}', '\u{2B99}', + '\u{2B9A}', '\u{2B9B}', '\u{2B9C}', '\u{2B9D}', '\u{2B9E}', '\u{2B9F}', '\u{2BA0}', '\u{2BA1}', + '\u{2BA2}', '\u{2BA3}', '\u{2BA4}', '\u{2BA5}', '\u{2BA6}', '\u{2BA7}', '\u{2BA8}', '\u{2BA9}', + '\u{2BAA}', '\u{2BAB}', '\u{2BAC}', '\u{2BAD}', '\u{2BAE}', '\u{2BAF}', '\u{2BB0}', '\u{2BB1}', + '\u{2BB2}', '\u{2BB3}', '\u{2BB4}', '\u{2BB5}', '\u{2BB6}', '\u{2BB7}', '\u{2BB8}', '\u{2BB9}', + '\u{2BBA}', '\u{2BBB}', '\u{2BBC}', '\u{2BBD}', '\u{2BBE}', '\u{2BBF}', '\u{2BC0}', '\u{2BC1}', + '\u{2BC2}', '\u{2BC3}', '\u{2BC4}', '\u{2BC5}', '\u{2BC6}', '\u{2BC7}', '\u{2BC8}', '\u{2BC9}', + '\u{2BCA}', '\u{2BCB}', '\u{2BCC}', '\u{2BCD}', '\u{2BCE}', '\u{2BCF}', '\u{2BD0}', '\u{2BD1}', + '\u{2BD2}', '\u{2BD3}', '\u{2BD4}', '\u{2BD5}', '\u{2BD6}', '\u{2BD7}', '\u{2BD8}', '\u{2BD9}', + '\u{2BDA}', '\u{2BDB}', '\u{2BDC}', '\u{2BDD}', '\u{2BDE}', '\u{2BDF}', '\u{2BE0}', '\u{2BE1}', + '\u{2BE2}', '\u{2BE3}', '\u{2BE4}', '\u{2BE5}', '\u{2BE6}', '\u{2BE7}', '\u{2BE8}', '\u{2BE9}', + '\u{2BEA}', '\u{2BEB}', '\u{2BEC}', '\u{2BED}', '\u{2BEE}', '\u{2BEF}', '\u{2BF0}', '\u{2BF1}', + '\u{2BF2}', '\u{2BF3}', '\u{2BF4}', '\u{2BF5}', '\u{2BF6}', '\u{2BF7}', '\u{2BF8}', '\u{2BF9}', + '\u{2BFA}', '\u{2BFB}', '\u{2BFC}', '\u{2BFD}', '\u{2BFE}', '\u{2BFF}', '\u{2CE5}', '\u{2CE6}', + '\u{2CE7}', '\u{2CE8}', '\u{2CE9}', '\u{2CEA}', '\u{2E50}', '\u{2E51}', '\u{2E80}', '\u{2E81}', + '\u{2E82}', '\u{2E83}', '\u{2E84}', '\u{2E85}', '\u{2E86}', '\u{2E87}', '\u{2E88}', '\u{2E89}', + '\u{2E8A}', '\u{2E8B}', '\u{2E8C}', '\u{2E8D}', '\u{2E8E}', '\u{2E8F}', '\u{2E90}', '\u{2E91}', + '\u{2E92}', '\u{2E93}', '\u{2E94}', '\u{2E95}', '\u{2E96}', '\u{2E97}', '\u{2E98}', '\u{2E99}', + '\u{2E9B}', '\u{2E9C}', '\u{2E9D}', '\u{2E9E}', '\u{2E9F}', '\u{2EA0}', '\u{2EA1}', '\u{2EA2}', + '\u{2EA3}', '\u{2EA4}', '\u{2EA5}', '\u{2EA6}', '\u{2EA7}', '\u{2EA8}', '\u{2EA9}', '\u{2EAA}', + '\u{2EAB}', '\u{2EAC}', '\u{2EAD}', '\u{2EAE}', '\u{2EAF}', '\u{2EB0}', '\u{2EB1}', '\u{2EB2}', + '\u{2EB3}', '\u{2EB4}', '\u{2EB5}', '\u{2EB6}', '\u{2EB7}', '\u{2EB8}', '\u{2EB9}', '\u{2EBA}', + '\u{2EBB}', '\u{2EBC}', '\u{2EBD}', '\u{2EBE}', '\u{2EBF}', '\u{2EC0}', '\u{2EC1}', '\u{2EC2}', + '\u{2EC3}', '\u{2EC4}', '\u{2EC5}', '\u{2EC6}', '\u{2EC7}', '\u{2EC8}', '\u{2EC9}', '\u{2ECA}', + '\u{2ECB}', '\u{2ECC}', '\u{2ECD}', '\u{2ECE}', '\u{2ECF}', '\u{2ED0}', '\u{2ED1}', '\u{2ED2}', + '\u{2ED3}', '\u{2ED4}', '\u{2ED5}', '\u{2ED6}', '\u{2ED7}', '\u{2ED8}', '\u{2ED9}', '\u{2EDA}', + '\u{2EDB}', '\u{2EDC}', '\u{2EDD}', '\u{2EDE}', '\u{2EDF}', '\u{2EE0}', '\u{2EE1}', '\u{2EE2}', + '\u{2EE3}', '\u{2EE4}', '\u{2EE5}', '\u{2EE6}', '\u{2EE7}', '\u{2EE8}', '\u{2EE9}', '\u{2EEA}', + '\u{2EEB}', '\u{2EEC}', '\u{2EED}', '\u{2EEE}', '\u{2EEF}', '\u{2EF0}', '\u{2EF1}', '\u{2EF2}', + '\u{2EF3}', '\u{2F00}', '\u{2F01}', '\u{2F02}', '\u{2F03}', '\u{2F04}', '\u{2F05}', '\u{2F06}', + '\u{2F07}', '\u{2F08}', '\u{2F09}', '\u{2F0A}', '\u{2F0B}', '\u{2F0C}', '\u{2F0D}', '\u{2F0E}', + '\u{2F0F}', '\u{2F10}', '\u{2F11}', '\u{2F12}', '\u{2F13}', '\u{2F14}', '\u{2F15}', '\u{2F16}', + '\u{2F17}', '\u{2F18}', '\u{2F19}', '\u{2F1A}', '\u{2F1B}', '\u{2F1C}', '\u{2F1D}', '\u{2F1E}', + '\u{2F1F}', '\u{2F20}', '\u{2F21}', '\u{2F22}', '\u{2F23}', '\u{2F24}', '\u{2F25}', '\u{2F26}', + '\u{2F27}', '\u{2F28}', '\u{2F29}', '\u{2F2A}', '\u{2F2B}', '\u{2F2C}', '\u{2F2D}', '\u{2F2E}', + '\u{2F2F}', '\u{2F30}', '\u{2F31}', '\u{2F32}', '\u{2F33}', '\u{2F34}', '\u{2F35}', '\u{2F36}', + '\u{2F37}', '\u{2F38}', '\u{2F39}', '\u{2F3A}', '\u{2F3B}', '\u{2F3C}', '\u{2F3D}', '\u{2F3E}', + '\u{2F3F}', '\u{2F40}', '\u{2F41}', '\u{2F42}', '\u{2F43}', '\u{2F44}', '\u{2F45}', '\u{2F46}', + '\u{2F47}', '\u{2F48}', '\u{2F49}', '\u{2F4A}', '\u{2F4B}', '\u{2F4C}', '\u{2F4D}', '\u{2F4E}', + '\u{2F4F}', '\u{2F50}', '\u{2F51}', '\u{2F52}', '\u{2F53}', '\u{2F54}', '\u{2F55}', '\u{2F56}', + '\u{2F57}', '\u{2F58}', '\u{2F59}', '\u{2F5A}', '\u{2F5B}', '\u{2F5C}', '\u{2F5D}', '\u{2F5E}', + '\u{2F5F}', '\u{2F60}', '\u{2F61}', '\u{2F62}', '\u{2F63}', '\u{2F64}', '\u{2F65}', '\u{2F66}', + '\u{2F67}', '\u{2F68}', '\u{2F69}', '\u{2F6A}', '\u{2F6B}', '\u{2F6C}', '\u{2F6D}', '\u{2F6E}', + '\u{2F6F}', '\u{2F70}', '\u{2F71}', '\u{2F72}', '\u{2F73}', '\u{2F74}', '\u{2F75}', '\u{2F76}', + '\u{2F77}', '\u{2F78}', '\u{2F79}', '\u{2F7A}', '\u{2F7B}', '\u{2F7C}', '\u{2F7D}', '\u{2F7E}', + '\u{2F7F}', '\u{2F80}', '\u{2F81}', '\u{2F82}', '\u{2F83}', '\u{2F84}', '\u{2F85}', '\u{2F86}', + '\u{2F87}', '\u{2F88}', '\u{2F89}', '\u{2F8A}', '\u{2F8B}', '\u{2F8C}', '\u{2F8D}', '\u{2F8E}', + '\u{2F8F}', '\u{2F90}', '\u{2F91}', '\u{2F92}', '\u{2F93}', '\u{2F94}', '\u{2F95}', '\u{2F96}', + '\u{2F97}', '\u{2F98}', '\u{2F99}', '\u{2F9A}', '\u{2F9B}', '\u{2F9C}', '\u{2F9D}', '\u{2F9E}', + '\u{2F9F}', '\u{2FA0}', '\u{2FA1}', '\u{2FA2}', '\u{2FA3}', '\u{2FA4}', '\u{2FA5}', '\u{2FA6}', + '\u{2FA7}', '\u{2FA8}', '\u{2FA9}', '\u{2FAA}', '\u{2FAB}', '\u{2FAC}', '\u{2FAD}', '\u{2FAE}', + '\u{2FAF}', '\u{2FB0}', '\u{2FB1}', '\u{2FB2}', '\u{2FB3}', '\u{2FB4}', '\u{2FB5}', '\u{2FB6}', + '\u{2FB7}', '\u{2FB8}', '\u{2FB9}', '\u{2FBA}', '\u{2FBB}', '\u{2FBC}', '\u{2FBD}', '\u{2FBE}', + '\u{2FBF}', '\u{2FC0}', '\u{2FC1}', '\u{2FC2}', '\u{2FC3}', '\u{2FC4}', '\u{2FC5}', '\u{2FC6}', + '\u{2FC7}', '\u{2FC8}', '\u{2FC9}', '\u{2FCA}', '\u{2FCB}', '\u{2FCC}', '\u{2FCD}', '\u{2FCE}', + '\u{2FCF}', '\u{2FD0}', '\u{2FD1}', '\u{2FD2}', '\u{2FD3}', '\u{2FD4}', '\u{2FD5}', '\u{2FF0}', + '\u{2FF1}', '\u{2FF2}', '\u{2FF3}', '\u{2FF4}', '\u{2FF5}', '\u{2FF6}', '\u{2FF7}', '\u{2FF8}', + '\u{2FF9}', '\u{2FFA}', '\u{2FFB}', '\u{3004}', '\u{3012}', '\u{3013}', '\u{3020}', '\u{3036}', + '\u{3037}', '\u{303E}', '\u{303F}', '\u{3190}', '\u{3191}', '\u{3196}', '\u{3197}', '\u{3198}', + '\u{3199}', '\u{319A}', '\u{319B}', '\u{319C}', '\u{319D}', '\u{319E}', '\u{319F}', '\u{31C0}', + '\u{31C1}', '\u{31C2}', '\u{31C3}', '\u{31C4}', '\u{31C5}', '\u{31C6}', '\u{31C7}', '\u{31C8}', + '\u{31C9}', '\u{31CA}', '\u{31CB}', '\u{31CC}', '\u{31CD}', '\u{31CE}', '\u{31CF}', '\u{31D0}', + '\u{31D1}', '\u{31D2}', '\u{31D3}', '\u{31D4}', '\u{31D5}', '\u{31D6}', '\u{31D7}', '\u{31D8}', + '\u{31D9}', '\u{31DA}', '\u{31DB}', '\u{31DC}', '\u{31DD}', '\u{31DE}', '\u{31DF}', '\u{31E0}', + '\u{31E1}', '\u{31E2}', '\u{31E3}', '\u{3200}', '\u{3201}', '\u{3202}', '\u{3203}', '\u{3204}', + '\u{3205}', '\u{3206}', '\u{3207}', '\u{3208}', '\u{3209}', '\u{320A}', '\u{320B}', '\u{320C}', + '\u{320D}', '\u{320E}', '\u{320F}', '\u{3210}', '\u{3211}', '\u{3212}', '\u{3213}', '\u{3214}', + '\u{3215}', '\u{3216}', '\u{3217}', '\u{3218}', '\u{3219}', '\u{321A}', '\u{321B}', '\u{321C}', + '\u{321D}', '\u{321E}', '\u{322A}', '\u{322B}', '\u{322C}', '\u{322D}', '\u{322E}', '\u{322F}', + '\u{3230}', '\u{3231}', '\u{3232}', '\u{3233}', '\u{3234}', '\u{3235}', '\u{3236}', '\u{3237}', + '\u{3238}', '\u{3239}', '\u{323A}', '\u{323B}', '\u{323C}', '\u{323D}', '\u{323E}', '\u{323F}', + '\u{3240}', '\u{3241}', '\u{3242}', '\u{3243}', '\u{3244}', '\u{3245}', '\u{3246}', '\u{3247}', + '\u{3250}', '\u{3260}', '\u{3261}', '\u{3262}', '\u{3263}', '\u{3264}', '\u{3265}', '\u{3266}', + '\u{3267}', '\u{3268}', '\u{3269}', '\u{326A}', '\u{326B}', '\u{326C}', '\u{326D}', '\u{326E}', + '\u{326F}', '\u{3270}', '\u{3271}', '\u{3272}', '\u{3273}', '\u{3274}', '\u{3275}', '\u{3276}', + '\u{3277}', '\u{3278}', '\u{3279}', '\u{327A}', '\u{327B}', '\u{327C}', '\u{327D}', '\u{327E}', + '\u{327F}', '\u{328A}', '\u{328B}', '\u{328C}', '\u{328D}', '\u{328E}', '\u{328F}', '\u{3290}', + '\u{3291}', '\u{3292}', '\u{3293}', '\u{3294}', '\u{3295}', '\u{3296}', '\u{3297}', '\u{3298}', + '\u{3299}', '\u{329A}', '\u{329B}', '\u{329C}', '\u{329D}', '\u{329E}', '\u{329F}', '\u{32A0}', + '\u{32A1}', '\u{32A2}', '\u{32A3}', '\u{32A4}', '\u{32A5}', '\u{32A6}', '\u{32A7}', '\u{32A8}', + '\u{32A9}', '\u{32AA}', '\u{32AB}', '\u{32AC}', '\u{32AD}', '\u{32AE}', '\u{32AF}', '\u{32B0}', + '\u{32C0}', '\u{32C1}', '\u{32C2}', '\u{32C3}', '\u{32C4}', '\u{32C5}', '\u{32C6}', '\u{32C7}', + '\u{32C8}', '\u{32C9}', '\u{32CA}', '\u{32CB}', '\u{32CC}', '\u{32CD}', '\u{32CE}', '\u{32CF}', + '\u{32D0}', '\u{32D1}', '\u{32D2}', '\u{32D3}', '\u{32D4}', '\u{32D5}', '\u{32D6}', '\u{32D7}', + '\u{32D8}', '\u{32D9}', '\u{32DA}', '\u{32DB}', '\u{32DC}', '\u{32DD}', '\u{32DE}', '\u{32DF}', + '\u{32E0}', '\u{32E1}', '\u{32E2}', '\u{32E3}', '\u{32E4}', '\u{32E5}', '\u{32E6}', '\u{32E7}', + '\u{32E8}', '\u{32E9}', '\u{32EA}', '\u{32EB}', '\u{32EC}', '\u{32ED}', '\u{32EE}', '\u{32EF}', + '\u{32F0}', '\u{32F1}', '\u{32F2}', '\u{32F3}', '\u{32F4}', '\u{32F5}', '\u{32F6}', '\u{32F7}', + '\u{32F8}', '\u{32F9}', '\u{32FA}', '\u{32FB}', '\u{32FC}', '\u{32FD}', '\u{32FE}', '\u{32FF}', + '\u{3300}', '\u{3301}', '\u{3302}', '\u{3303}', '\u{3304}', '\u{3305}', '\u{3306}', '\u{3307}', + '\u{3308}', '\u{3309}', '\u{330A}', '\u{330B}', '\u{330C}', '\u{330D}', '\u{330E}', '\u{330F}', + '\u{3310}', '\u{3311}', '\u{3312}', '\u{3313}', '\u{3314}', '\u{3315}', '\u{3316}', '\u{3317}', + '\u{3318}', '\u{3319}', '\u{331A}', '\u{331B}', '\u{331C}', '\u{331D}', '\u{331E}', '\u{331F}', + '\u{3320}', '\u{3321}', '\u{3322}', '\u{3323}', '\u{3324}', '\u{3325}', '\u{3326}', '\u{3327}', + '\u{3328}', '\u{3329}', '\u{332A}', '\u{332B}', '\u{332C}', '\u{332D}', '\u{332E}', '\u{332F}', + '\u{3330}', '\u{3331}', '\u{3332}', '\u{3333}', '\u{3334}', '\u{3335}', '\u{3336}', '\u{3337}', + '\u{3338}', '\u{3339}', '\u{333A}', '\u{333B}', '\u{333C}', '\u{333D}', '\u{333E}', '\u{333F}', + '\u{3340}', '\u{3341}', '\u{3342}', '\u{3343}', '\u{3344}', '\u{3345}', '\u{3346}', '\u{3347}', + '\u{3348}', '\u{3349}', '\u{334A}', '\u{334B}', '\u{334C}', '\u{334D}', '\u{334E}', '\u{334F}', + '\u{3350}', '\u{3351}', '\u{3352}', '\u{3353}', '\u{3354}', '\u{3355}', '\u{3356}', '\u{3357}', + '\u{3358}', '\u{3359}', '\u{335A}', '\u{335B}', '\u{335C}', '\u{335D}', '\u{335E}', '\u{335F}', + '\u{3360}', '\u{3361}', '\u{3362}', '\u{3363}', '\u{3364}', '\u{3365}', '\u{3366}', '\u{3367}', + '\u{3368}', '\u{3369}', '\u{336A}', '\u{336B}', '\u{336C}', '\u{336D}', '\u{336E}', '\u{336F}', + '\u{3370}', '\u{3371}', '\u{3372}', '\u{3373}', '\u{3374}', '\u{3375}', '\u{3376}', '\u{3377}', + '\u{3378}', '\u{3379}', '\u{337A}', '\u{337B}', '\u{337C}', '\u{337D}', '\u{337E}', '\u{337F}', + '\u{3380}', '\u{3381}', '\u{3382}', '\u{3383}', '\u{3384}', '\u{3385}', '\u{3386}', '\u{3387}', + '\u{3388}', '\u{3389}', '\u{338A}', '\u{338B}', '\u{338C}', '\u{338D}', '\u{338E}', '\u{338F}', + '\u{3390}', '\u{3391}', '\u{3392}', '\u{3393}', '\u{3394}', '\u{3395}', '\u{3396}', '\u{3397}', + '\u{3398}', '\u{3399}', '\u{339A}', '\u{339B}', '\u{339C}', '\u{339D}', '\u{339E}', '\u{339F}', + '\u{33A0}', '\u{33A1}', '\u{33A2}', '\u{33A3}', '\u{33A4}', '\u{33A5}', '\u{33A6}', '\u{33A7}', + '\u{33A8}', '\u{33A9}', '\u{33AA}', '\u{33AB}', '\u{33AC}', '\u{33AD}', '\u{33AE}', '\u{33AF}', + '\u{33B0}', '\u{33B1}', '\u{33B2}', '\u{33B3}', '\u{33B4}', '\u{33B5}', '\u{33B6}', '\u{33B7}', + '\u{33B8}', '\u{33B9}', '\u{33BA}', '\u{33BB}', '\u{33BC}', '\u{33BD}', '\u{33BE}', '\u{33BF}', + '\u{33C0}', '\u{33C1}', '\u{33C2}', '\u{33C3}', '\u{33C4}', '\u{33C5}', '\u{33C6}', '\u{33C7}', + '\u{33C8}', '\u{33C9}', '\u{33CA}', '\u{33CB}', '\u{33CC}', '\u{33CD}', '\u{33CE}', '\u{33CF}', + '\u{33D0}', '\u{33D1}', '\u{33D2}', '\u{33D3}', '\u{33D4}', '\u{33D5}', '\u{33D6}', '\u{33D7}', + '\u{33D8}', '\u{33D9}', '\u{33DA}', '\u{33DB}', '\u{33DC}', '\u{33DD}', '\u{33DE}', '\u{33DF}', + '\u{33E0}', '\u{33E1}', '\u{33E2}', '\u{33E3}', '\u{33E4}', '\u{33E5}', '\u{33E6}', '\u{33E7}', + '\u{33E8}', '\u{33E9}', '\u{33EA}', '\u{33EB}', '\u{33EC}', '\u{33ED}', '\u{33EE}', '\u{33EF}', + '\u{33F0}', '\u{33F1}', '\u{33F2}', '\u{33F3}', '\u{33F4}', '\u{33F5}', '\u{33F6}', '\u{33F7}', + '\u{33F8}', '\u{33F9}', '\u{33FA}', '\u{33FB}', '\u{33FC}', '\u{33FD}', '\u{33FE}', '\u{33FF}', + '\u{4DC0}', '\u{4DC1}', '\u{4DC2}', '\u{4DC3}', '\u{4DC4}', '\u{4DC5}', '\u{4DC6}', '\u{4DC7}', + '\u{4DC8}', '\u{4DC9}', '\u{4DCA}', '\u{4DCB}', '\u{4DCC}', '\u{4DCD}', '\u{4DCE}', '\u{4DCF}', + '\u{4DD0}', '\u{4DD1}', '\u{4DD2}', '\u{4DD3}', '\u{4DD4}', '\u{4DD5}', '\u{4DD6}', '\u{4DD7}', + '\u{4DD8}', '\u{4DD9}', '\u{4DDA}', '\u{4DDB}', '\u{4DDC}', '\u{4DDD}', '\u{4DDE}', '\u{4DDF}', + '\u{4DE0}', '\u{4DE1}', '\u{4DE2}', '\u{4DE3}', '\u{4DE4}', '\u{4DE5}', '\u{4DE6}', '\u{4DE7}', + '\u{4DE8}', '\u{4DE9}', '\u{4DEA}', '\u{4DEB}', '\u{4DEC}', '\u{4DED}', '\u{4DEE}', '\u{4DEF}', + '\u{4DF0}', '\u{4DF1}', '\u{4DF2}', '\u{4DF3}', '\u{4DF4}', '\u{4DF5}', '\u{4DF6}', '\u{4DF7}', + '\u{4DF8}', '\u{4DF9}', '\u{4DFA}', '\u{4DFB}', '\u{4DFC}', '\u{4DFD}', '\u{4DFE}', '\u{4DFF}', + '\u{A490}', '\u{A491}', '\u{A492}', '\u{A493}', '\u{A494}', '\u{A495}', '\u{A496}', '\u{A497}', + '\u{A498}', '\u{A499}', '\u{A49A}', '\u{A49B}', '\u{A49C}', '\u{A49D}', '\u{A49E}', '\u{A49F}', + '\u{A4A0}', '\u{A4A1}', '\u{A4A2}', '\u{A4A3}', '\u{A4A4}', '\u{A4A5}', '\u{A4A6}', '\u{A4A7}', + '\u{A4A8}', '\u{A4A9}', '\u{A4AA}', '\u{A4AB}', '\u{A4AC}', '\u{A4AD}', '\u{A4AE}', '\u{A4AF}', + '\u{A4B0}', '\u{A4B1}', '\u{A4B2}', '\u{A4B3}', '\u{A4B4}', '\u{A4B5}', '\u{A4B6}', '\u{A4B7}', + '\u{A4B8}', '\u{A4B9}', '\u{A4BA}', '\u{A4BB}', '\u{A4BC}', '\u{A4BD}', '\u{A4BE}', '\u{A4BF}', + '\u{A4C0}', '\u{A4C1}', '\u{A4C2}', '\u{A4C3}', '\u{A4C4}', '\u{A4C5}', '\u{A4C6}', '\u{A828}', + '\u{A829}', '\u{A82A}', '\u{A82B}', '\u{A836}', '\u{A837}', '\u{A839}', '\u{AA77}', '\u{AA78}', + '\u{AA79}', '\u{FD40}', '\u{FD41}', '\u{FD42}', '\u{FD43}', '\u{FD44}', '\u{FD45}', '\u{FD46}', + '\u{FD47}', '\u{FD48}', '\u{FD49}', '\u{FD4A}', '\u{FD4B}', '\u{FD4C}', '\u{FD4D}', '\u{FD4E}', + '\u{FD4F}', '\u{FDCF}', '\u{FDFD}', '\u{FDFE}', '\u{FDFF}', '\u{FFE4}', '\u{FFE8}', '\u{FFED}', + '\u{FFEE}', '\u{FFFC}', '\u{FFFD}', '\u{10137}', '\u{10138}', '\u{10139}', '\u{1013A}', '\u{1013B}', + '\u{1013C}', '\u{1013D}', '\u{1013E}', '\u{1013F}', '\u{10179}', '\u{1017A}', '\u{1017B}', '\u{1017C}', + '\u{1017D}', '\u{1017E}', '\u{1017F}', '\u{10180}', '\u{10181}', '\u{10182}', '\u{10183}', '\u{10184}', + '\u{10185}', '\u{10186}', '\u{10187}', '\u{10188}', '\u{10189}', '\u{1018C}', '\u{1018D}', '\u{1018E}', + '\u{10190}', '\u{10191}', '\u{10192}', '\u{10193}', '\u{10194}', '\u{10195}', '\u{10196}', '\u{10197}', + '\u{10198}', '\u{10199}', '\u{1019A}', '\u{1019B}', '\u{1019C}', '\u{101A0}', '\u{101D0}', '\u{101D1}', + '\u{101D2}', '\u{101D3}', '\u{101D4}', '\u{101D5}', '\u{101D6}', '\u{101D7}', '\u{101D8}', '\u{101D9}', + '\u{101DA}', '\u{101DB}', '\u{101DC}', '\u{101DD}', '\u{101DE}', '\u{101DF}', '\u{101E0}', '\u{101E1}', + '\u{101E2}', '\u{101E3}', '\u{101E4}', '\u{101E5}', '\u{101E6}', '\u{101E7}', '\u{101E8}', '\u{101E9}', + '\u{101EA}', '\u{101EB}', '\u{101EC}', '\u{101ED}', '\u{101EE}', '\u{101EF}', '\u{101F0}', '\u{101F1}', + '\u{101F2}', '\u{101F3}', '\u{101F4}', '\u{101F5}', '\u{101F6}', '\u{101F7}', '\u{101F8}', '\u{101F9}', + '\u{101FA}', '\u{101FB}', '\u{101FC}', '\u{10877}', '\u{10878}', '\u{10AC8}', '\u{1173F}', '\u{11FD5}', + '\u{11FD6}', '\u{11FD7}', '\u{11FD8}', '\u{11FD9}', '\u{11FDA}', '\u{11FDB}', '\u{11FDC}', '\u{11FE1}', + '\u{11FE2}', '\u{11FE3}', '\u{11FE4}', '\u{11FE5}', '\u{11FE6}', '\u{11FE7}', '\u{11FE8}', '\u{11FE9}', + '\u{11FEA}', '\u{11FEB}', '\u{11FEC}', '\u{11FED}', '\u{11FEE}', '\u{11FEF}', '\u{11FF0}', '\u{11FF1}', + '\u{16B3C}', '\u{16B3D}', '\u{16B3E}', '\u{16B3F}', '\u{16B45}', '\u{1BC9C}', '\u{1CF50}', '\u{1CF51}', + '\u{1CF52}', '\u{1CF53}', '\u{1CF54}', '\u{1CF55}', '\u{1CF56}', '\u{1CF57}', '\u{1CF58}', '\u{1CF59}', + '\u{1CF5A}', '\u{1CF5B}', '\u{1CF5C}', '\u{1CF5D}', '\u{1CF5E}', '\u{1CF5F}', '\u{1CF60}', '\u{1CF61}', + '\u{1CF62}', '\u{1CF63}', '\u{1CF64}', '\u{1CF65}', '\u{1CF66}', '\u{1CF67}', '\u{1CF68}', '\u{1CF69}', + '\u{1CF6A}', '\u{1CF6B}', '\u{1CF6C}', '\u{1CF6D}', '\u{1CF6E}', '\u{1CF6F}', '\u{1CF70}', '\u{1CF71}', + '\u{1CF72}', '\u{1CF73}', '\u{1CF74}', '\u{1CF75}', '\u{1CF76}', '\u{1CF77}', '\u{1CF78}', '\u{1CF79}', + '\u{1CF7A}', '\u{1CF7B}', '\u{1CF7C}', '\u{1CF7D}', '\u{1CF7E}', '\u{1CF7F}', '\u{1CF80}', '\u{1CF81}', + '\u{1CF82}', '\u{1CF83}', '\u{1CF84}', '\u{1CF85}', '\u{1CF86}', '\u{1CF87}', '\u{1CF88}', '\u{1CF89}', + '\u{1CF8A}', '\u{1CF8B}', '\u{1CF8C}', '\u{1CF8D}', '\u{1CF8E}', '\u{1CF8F}', '\u{1CF90}', '\u{1CF91}', + '\u{1CF92}', '\u{1CF93}', '\u{1CF94}', '\u{1CF95}', '\u{1CF96}', '\u{1CF97}', '\u{1CF98}', '\u{1CF99}', + '\u{1CF9A}', '\u{1CF9B}', '\u{1CF9C}', '\u{1CF9D}', '\u{1CF9E}', '\u{1CF9F}', '\u{1CFA0}', '\u{1CFA1}', + '\u{1CFA2}', '\u{1CFA3}', '\u{1CFA4}', '\u{1CFA5}', '\u{1CFA6}', '\u{1CFA7}', '\u{1CFA8}', '\u{1CFA9}', + '\u{1CFAA}', '\u{1CFAB}', '\u{1CFAC}', '\u{1CFAD}', '\u{1CFAE}', '\u{1CFAF}', '\u{1CFB0}', '\u{1CFB1}', + '\u{1CFB2}', '\u{1CFB3}', '\u{1CFB4}', '\u{1CFB5}', '\u{1CFB6}', '\u{1CFB7}', '\u{1CFB8}', '\u{1CFB9}', + '\u{1CFBA}', '\u{1CFBB}', '\u{1CFBC}', '\u{1CFBD}', '\u{1CFBE}', '\u{1CFBF}', '\u{1CFC0}', '\u{1CFC1}', + '\u{1CFC2}', '\u{1CFC3}', '\u{1D000}', '\u{1D001}', '\u{1D002}', '\u{1D003}', '\u{1D004}', '\u{1D005}', + '\u{1D006}', '\u{1D007}', '\u{1D008}', '\u{1D009}', '\u{1D00A}', '\u{1D00B}', '\u{1D00C}', '\u{1D00D}', + '\u{1D00E}', '\u{1D00F}', '\u{1D010}', '\u{1D011}', '\u{1D012}', '\u{1D013}', '\u{1D014}', '\u{1D015}', + '\u{1D016}', '\u{1D017}', '\u{1D018}', '\u{1D019}', '\u{1D01A}', '\u{1D01B}', '\u{1D01C}', '\u{1D01D}', + '\u{1D01E}', '\u{1D01F}', '\u{1D020}', '\u{1D021}', '\u{1D022}', '\u{1D023}', '\u{1D024}', '\u{1D025}', + '\u{1D026}', '\u{1D027}', '\u{1D028}', '\u{1D029}', '\u{1D02A}', '\u{1D02B}', '\u{1D02C}', '\u{1D02D}', + '\u{1D02E}', '\u{1D02F}', '\u{1D030}', '\u{1D031}', '\u{1D032}', '\u{1D033}', '\u{1D034}', '\u{1D035}', + '\u{1D036}', '\u{1D037}', '\u{1D038}', '\u{1D039}', '\u{1D03A}', '\u{1D03B}', '\u{1D03C}', '\u{1D03D}', + '\u{1D03E}', '\u{1D03F}', '\u{1D040}', '\u{1D041}', '\u{1D042}', '\u{1D043}', '\u{1D044}', '\u{1D045}', + '\u{1D046}', '\u{1D047}', '\u{1D048}', '\u{1D049}', '\u{1D04A}', '\u{1D04B}', '\u{1D04C}', '\u{1D04D}', + '\u{1D04E}', '\u{1D04F}', '\u{1D050}', '\u{1D051}', '\u{1D052}', '\u{1D053}', '\u{1D054}', '\u{1D055}', + '\u{1D056}', '\u{1D057}', '\u{1D058}', '\u{1D059}', '\u{1D05A}', '\u{1D05B}', '\u{1D05C}', '\u{1D05D}', + '\u{1D05E}', '\u{1D05F}', '\u{1D060}', '\u{1D061}', '\u{1D062}', '\u{1D063}', '\u{1D064}', '\u{1D065}', + '\u{1D066}', '\u{1D067}', '\u{1D068}', '\u{1D069}', '\u{1D06A}', '\u{1D06B}', '\u{1D06C}', '\u{1D06D}', + '\u{1D06E}', '\u{1D06F}', '\u{1D070}', '\u{1D071}', '\u{1D072}', '\u{1D073}', '\u{1D074}', '\u{1D075}', + '\u{1D076}', '\u{1D077}', '\u{1D078}', '\u{1D079}', '\u{1D07A}', '\u{1D07B}', '\u{1D07C}', '\u{1D07D}', + '\u{1D07E}', '\u{1D07F}', '\u{1D080}', '\u{1D081}', '\u{1D082}', '\u{1D083}', '\u{1D084}', '\u{1D085}', + '\u{1D086}', '\u{1D087}', '\u{1D088}', '\u{1D089}', '\u{1D08A}', '\u{1D08B}', '\u{1D08C}', '\u{1D08D}', + '\u{1D08E}', '\u{1D08F}', '\u{1D090}', '\u{1D091}', '\u{1D092}', '\u{1D093}', '\u{1D094}', '\u{1D095}', + '\u{1D096}', '\u{1D097}', '\u{1D098}', '\u{1D099}', '\u{1D09A}', '\u{1D09B}', '\u{1D09C}', '\u{1D09D}', + '\u{1D09E}', '\u{1D09F}', '\u{1D0A0}', '\u{1D0A1}', '\u{1D0A2}', '\u{1D0A3}', '\u{1D0A4}', '\u{1D0A5}', + '\u{1D0A6}', '\u{1D0A7}', '\u{1D0A8}', '\u{1D0A9}', '\u{1D0AA}', '\u{1D0AB}', '\u{1D0AC}', '\u{1D0AD}', + '\u{1D0AE}', '\u{1D0AF}', '\u{1D0B0}', '\u{1D0B1}', '\u{1D0B2}', '\u{1D0B3}', '\u{1D0B4}', '\u{1D0B5}', + '\u{1D0B6}', '\u{1D0B7}', '\u{1D0B8}', '\u{1D0B9}', '\u{1D0BA}', '\u{1D0BB}', '\u{1D0BC}', '\u{1D0BD}', + '\u{1D0BE}', '\u{1D0BF}', '\u{1D0C0}', '\u{1D0C1}', '\u{1D0C2}', '\u{1D0C3}', '\u{1D0C4}', '\u{1D0C5}', + '\u{1D0C6}', '\u{1D0C7}', '\u{1D0C8}', '\u{1D0C9}', '\u{1D0CA}', '\u{1D0CB}', '\u{1D0CC}', '\u{1D0CD}', + '\u{1D0CE}', '\u{1D0CF}', '\u{1D0D0}', '\u{1D0D1}', '\u{1D0D2}', '\u{1D0D3}', '\u{1D0D4}', '\u{1D0D5}', + '\u{1D0D6}', '\u{1D0D7}', '\u{1D0D8}', '\u{1D0D9}', '\u{1D0DA}', '\u{1D0DB}', '\u{1D0DC}', '\u{1D0DD}', + '\u{1D0DE}', '\u{1D0DF}', '\u{1D0E0}', '\u{1D0E1}', '\u{1D0E2}', '\u{1D0E3}', '\u{1D0E4}', '\u{1D0E5}', + '\u{1D0E6}', '\u{1D0E7}', '\u{1D0E8}', '\u{1D0E9}', '\u{1D0EA}', '\u{1D0EB}', '\u{1D0EC}', '\u{1D0ED}', + '\u{1D0EE}', '\u{1D0EF}', '\u{1D0F0}', '\u{1D0F1}', '\u{1D0F2}', '\u{1D0F3}', '\u{1D0F4}', '\u{1D0F5}', + '\u{1D100}', '\u{1D101}', '\u{1D102}', '\u{1D103}', '\u{1D104}', '\u{1D105}', '\u{1D106}', '\u{1D107}', + '\u{1D108}', '\u{1D109}', '\u{1D10A}', '\u{1D10B}', '\u{1D10C}', '\u{1D10D}', '\u{1D10E}', '\u{1D10F}', + '\u{1D110}', '\u{1D111}', '\u{1D112}', '\u{1D113}', '\u{1D114}', '\u{1D115}', '\u{1D116}', '\u{1D117}', + '\u{1D118}', '\u{1D119}', '\u{1D11A}', '\u{1D11B}', '\u{1D11C}', '\u{1D11D}', '\u{1D11E}', '\u{1D11F}', + '\u{1D120}', '\u{1D121}', '\u{1D122}', '\u{1D123}', '\u{1D124}', '\u{1D125}', '\u{1D126}', '\u{1D129}', + '\u{1D12A}', '\u{1D12B}', '\u{1D12C}', '\u{1D12D}', '\u{1D12E}', '\u{1D12F}', '\u{1D130}', '\u{1D131}', + '\u{1D132}', '\u{1D133}', '\u{1D134}', '\u{1D135}', '\u{1D136}', '\u{1D137}', '\u{1D138}', '\u{1D139}', + '\u{1D13A}', '\u{1D13B}', '\u{1D13C}', '\u{1D13D}', '\u{1D13E}', '\u{1D13F}', '\u{1D140}', '\u{1D141}', + '\u{1D142}', '\u{1D143}', '\u{1D144}', '\u{1D145}', '\u{1D146}', '\u{1D147}', '\u{1D148}', '\u{1D149}', + '\u{1D14A}', '\u{1D14B}', '\u{1D14C}', '\u{1D14D}', '\u{1D14E}', '\u{1D14F}', '\u{1D150}', '\u{1D151}', + '\u{1D152}', '\u{1D153}', '\u{1D154}', '\u{1D155}', '\u{1D156}', '\u{1D157}', '\u{1D158}', '\u{1D159}', + '\u{1D15A}', '\u{1D15B}', '\u{1D15C}', '\u{1D15D}', '\u{1D15E}', '\u{1D15F}', '\u{1D160}', '\u{1D161}', + '\u{1D162}', '\u{1D163}', '\u{1D164}', '\u{1D16A}', '\u{1D16B}', '\u{1D16C}', '\u{1D183}', '\u{1D184}', + '\u{1D18C}', '\u{1D18D}', '\u{1D18E}', '\u{1D18F}', '\u{1D190}', '\u{1D191}', '\u{1D192}', '\u{1D193}', + '\u{1D194}', '\u{1D195}', '\u{1D196}', '\u{1D197}', '\u{1D198}', '\u{1D199}', '\u{1D19A}', '\u{1D19B}', + '\u{1D19C}', '\u{1D19D}', '\u{1D19E}', '\u{1D19F}', '\u{1D1A0}', '\u{1D1A1}', '\u{1D1A2}', '\u{1D1A3}', + '\u{1D1A4}', '\u{1D1A5}', '\u{1D1A6}', '\u{1D1A7}', '\u{1D1A8}', '\u{1D1A9}', '\u{1D1AE}', '\u{1D1AF}', + '\u{1D1B0}', '\u{1D1B1}', '\u{1D1B2}', '\u{1D1B3}', '\u{1D1B4}', '\u{1D1B5}', '\u{1D1B6}', '\u{1D1B7}', + '\u{1D1B8}', '\u{1D1B9}', '\u{1D1BA}', '\u{1D1BB}', '\u{1D1BC}', '\u{1D1BD}', '\u{1D1BE}', '\u{1D1BF}', + '\u{1D1C0}', '\u{1D1C1}', '\u{1D1C2}', '\u{1D1C3}', '\u{1D1C4}', '\u{1D1C5}', '\u{1D1C6}', '\u{1D1C7}', + '\u{1D1C8}', '\u{1D1C9}', '\u{1D1CA}', '\u{1D1CB}', '\u{1D1CC}', '\u{1D1CD}', '\u{1D1CE}', '\u{1D1CF}', + '\u{1D1D0}', '\u{1D1D1}', '\u{1D1D2}', '\u{1D1D3}', '\u{1D1D4}', '\u{1D1D5}', '\u{1D1D6}', '\u{1D1D7}', + '\u{1D1D8}', '\u{1D1D9}', '\u{1D1DA}', '\u{1D1DB}', '\u{1D1DC}', '\u{1D1DD}', '\u{1D1DE}', '\u{1D1DF}', + '\u{1D1E0}', '\u{1D1E1}', '\u{1D1E2}', '\u{1D1E3}', '\u{1D1E4}', '\u{1D1E5}', '\u{1D1E6}', '\u{1D1E7}', + '\u{1D1E8}', '\u{1D1E9}', '\u{1D1EA}', '\u{1D200}', '\u{1D201}', '\u{1D202}', '\u{1D203}', '\u{1D204}', + '\u{1D205}', '\u{1D206}', '\u{1D207}', '\u{1D208}', '\u{1D209}', '\u{1D20A}', '\u{1D20B}', '\u{1D20C}', + '\u{1D20D}', '\u{1D20E}', '\u{1D20F}', '\u{1D210}', '\u{1D211}', '\u{1D212}', '\u{1D213}', '\u{1D214}', + '\u{1D215}', '\u{1D216}', '\u{1D217}', '\u{1D218}', '\u{1D219}', '\u{1D21A}', '\u{1D21B}', '\u{1D21C}', + '\u{1D21D}', '\u{1D21E}', '\u{1D21F}', '\u{1D220}', '\u{1D221}', '\u{1D222}', '\u{1D223}', '\u{1D224}', + '\u{1D225}', '\u{1D226}', '\u{1D227}', '\u{1D228}', '\u{1D229}', '\u{1D22A}', '\u{1D22B}', '\u{1D22C}', + '\u{1D22D}', '\u{1D22E}', '\u{1D22F}', '\u{1D230}', '\u{1D231}', '\u{1D232}', '\u{1D233}', '\u{1D234}', + '\u{1D235}', '\u{1D236}', '\u{1D237}', '\u{1D238}', '\u{1D239}', '\u{1D23A}', '\u{1D23B}', '\u{1D23C}', + '\u{1D23D}', '\u{1D23E}', '\u{1D23F}', '\u{1D240}', '\u{1D241}', '\u{1D245}', '\u{1D300}', '\u{1D301}', + '\u{1D302}', '\u{1D303}', '\u{1D304}', '\u{1D305}', '\u{1D306}', '\u{1D307}', '\u{1D308}', '\u{1D309}', + '\u{1D30A}', '\u{1D30B}', '\u{1D30C}', '\u{1D30D}', '\u{1D30E}', '\u{1D30F}', '\u{1D310}', '\u{1D311}', + '\u{1D312}', '\u{1D313}', '\u{1D314}', '\u{1D315}', '\u{1D316}', '\u{1D317}', '\u{1D318}', '\u{1D319}', + '\u{1D31A}', '\u{1D31B}', '\u{1D31C}', '\u{1D31D}', '\u{1D31E}', '\u{1D31F}', '\u{1D320}', '\u{1D321}', + '\u{1D322}', '\u{1D323}', '\u{1D324}', '\u{1D325}', '\u{1D326}', '\u{1D327}', '\u{1D328}', '\u{1D329}', + '\u{1D32A}', '\u{1D32B}', '\u{1D32C}', '\u{1D32D}', '\u{1D32E}', '\u{1D32F}', '\u{1D330}', '\u{1D331}', + '\u{1D332}', '\u{1D333}', '\u{1D334}', '\u{1D335}', '\u{1D336}', '\u{1D337}', '\u{1D338}', '\u{1D339}', + '\u{1D33A}', '\u{1D33B}', '\u{1D33C}', '\u{1D33D}', '\u{1D33E}', '\u{1D33F}', '\u{1D340}', '\u{1D341}', + '\u{1D342}', '\u{1D343}', '\u{1D344}', '\u{1D345}', '\u{1D346}', '\u{1D347}', '\u{1D348}', '\u{1D349}', + '\u{1D34A}', '\u{1D34B}', '\u{1D34C}', '\u{1D34D}', '\u{1D34E}', '\u{1D34F}', '\u{1D350}', '\u{1D351}', + '\u{1D352}', '\u{1D353}', '\u{1D354}', '\u{1D355}', '\u{1D356}', '\u{1D800}', '\u{1D801}', '\u{1D802}', + '\u{1D803}', '\u{1D804}', '\u{1D805}', '\u{1D806}', '\u{1D807}', '\u{1D808}', '\u{1D809}', '\u{1D80A}', + '\u{1D80B}', '\u{1D80C}', '\u{1D80D}', '\u{1D80E}', '\u{1D80F}', '\u{1D810}', '\u{1D811}', '\u{1D812}', + '\u{1D813}', '\u{1D814}', '\u{1D815}', '\u{1D816}', '\u{1D817}', '\u{1D818}', '\u{1D819}', '\u{1D81A}', + '\u{1D81B}', '\u{1D81C}', '\u{1D81D}', '\u{1D81E}', '\u{1D81F}', '\u{1D820}', '\u{1D821}', '\u{1D822}', + '\u{1D823}', '\u{1D824}', '\u{1D825}', '\u{1D826}', '\u{1D827}', '\u{1D828}', '\u{1D829}', '\u{1D82A}', + '\u{1D82B}', '\u{1D82C}', '\u{1D82D}', '\u{1D82E}', '\u{1D82F}', '\u{1D830}', '\u{1D831}', '\u{1D832}', + '\u{1D833}', '\u{1D834}', '\u{1D835}', '\u{1D836}', '\u{1D837}', '\u{1D838}', '\u{1D839}', '\u{1D83A}', + '\u{1D83B}', '\u{1D83C}', '\u{1D83D}', '\u{1D83E}', '\u{1D83F}', '\u{1D840}', '\u{1D841}', '\u{1D842}', + '\u{1D843}', '\u{1D844}', '\u{1D845}', '\u{1D846}', '\u{1D847}', '\u{1D848}', '\u{1D849}', '\u{1D84A}', + '\u{1D84B}', '\u{1D84C}', '\u{1D84D}', '\u{1D84E}', '\u{1D84F}', '\u{1D850}', '\u{1D851}', '\u{1D852}', + '\u{1D853}', '\u{1D854}', '\u{1D855}', '\u{1D856}', '\u{1D857}', '\u{1D858}', '\u{1D859}', '\u{1D85A}', + '\u{1D85B}', '\u{1D85C}', '\u{1D85D}', '\u{1D85E}', '\u{1D85F}', '\u{1D860}', '\u{1D861}', '\u{1D862}', + '\u{1D863}', '\u{1D864}', '\u{1D865}', '\u{1D866}', '\u{1D867}', '\u{1D868}', '\u{1D869}', '\u{1D86A}', + '\u{1D86B}', '\u{1D86C}', '\u{1D86D}', '\u{1D86E}', '\u{1D86F}', '\u{1D870}', '\u{1D871}', '\u{1D872}', + '\u{1D873}', '\u{1D874}', '\u{1D875}', '\u{1D876}', '\u{1D877}', '\u{1D878}', '\u{1D879}', '\u{1D87A}', + '\u{1D87B}', '\u{1D87C}', '\u{1D87D}', '\u{1D87E}', '\u{1D87F}', '\u{1D880}', '\u{1D881}', '\u{1D882}', + '\u{1D883}', '\u{1D884}', '\u{1D885}', '\u{1D886}', '\u{1D887}', '\u{1D888}', '\u{1D889}', '\u{1D88A}', + '\u{1D88B}', '\u{1D88C}', '\u{1D88D}', '\u{1D88E}', '\u{1D88F}', '\u{1D890}', '\u{1D891}', '\u{1D892}', + '\u{1D893}', '\u{1D894}', '\u{1D895}', '\u{1D896}', '\u{1D897}', '\u{1D898}', '\u{1D899}', '\u{1D89A}', + '\u{1D89B}', '\u{1D89C}', '\u{1D89D}', '\u{1D89E}', '\u{1D89F}', '\u{1D8A0}', '\u{1D8A1}', '\u{1D8A2}', + '\u{1D8A3}', '\u{1D8A4}', '\u{1D8A5}', '\u{1D8A6}', '\u{1D8A7}', '\u{1D8A8}', '\u{1D8A9}', '\u{1D8AA}', + '\u{1D8AB}', '\u{1D8AC}', '\u{1D8AD}', '\u{1D8AE}', '\u{1D8AF}', '\u{1D8B0}', '\u{1D8B1}', '\u{1D8B2}', + '\u{1D8B3}', '\u{1D8B4}', '\u{1D8B5}', '\u{1D8B6}', '\u{1D8B7}', '\u{1D8B8}', '\u{1D8B9}', '\u{1D8BA}', + '\u{1D8BB}', '\u{1D8BC}', '\u{1D8BD}', '\u{1D8BE}', '\u{1D8BF}', '\u{1D8C0}', '\u{1D8C1}', '\u{1D8C2}', + '\u{1D8C3}', '\u{1D8C4}', '\u{1D8C5}', '\u{1D8C6}', '\u{1D8C7}', '\u{1D8C8}', '\u{1D8C9}', '\u{1D8CA}', + '\u{1D8CB}', '\u{1D8CC}', '\u{1D8CD}', '\u{1D8CE}', '\u{1D8CF}', '\u{1D8D0}', '\u{1D8D1}', '\u{1D8D2}', + '\u{1D8D3}', '\u{1D8D4}', '\u{1D8D5}', '\u{1D8D6}', '\u{1D8D7}', '\u{1D8D8}', '\u{1D8D9}', '\u{1D8DA}', + '\u{1D8DB}', '\u{1D8DC}', '\u{1D8DD}', '\u{1D8DE}', '\u{1D8DF}', '\u{1D8E0}', '\u{1D8E1}', '\u{1D8E2}', + '\u{1D8E3}', '\u{1D8E4}', '\u{1D8E5}', '\u{1D8E6}', '\u{1D8E7}', '\u{1D8E8}', '\u{1D8E9}', '\u{1D8EA}', + '\u{1D8EB}', '\u{1D8EC}', '\u{1D8ED}', '\u{1D8EE}', '\u{1D8EF}', '\u{1D8F0}', '\u{1D8F1}', '\u{1D8F2}', + '\u{1D8F3}', '\u{1D8F4}', '\u{1D8F5}', '\u{1D8F6}', '\u{1D8F7}', '\u{1D8F8}', '\u{1D8F9}', '\u{1D8FA}', + '\u{1D8FB}', '\u{1D8FC}', '\u{1D8FD}', '\u{1D8FE}', '\u{1D8FF}', '\u{1D900}', '\u{1D901}', '\u{1D902}', + '\u{1D903}', '\u{1D904}', '\u{1D905}', '\u{1D906}', '\u{1D907}', '\u{1D908}', '\u{1D909}', '\u{1D90A}', + '\u{1D90B}', '\u{1D90C}', '\u{1D90D}', '\u{1D90E}', '\u{1D90F}', '\u{1D910}', '\u{1D911}', '\u{1D912}', + '\u{1D913}', '\u{1D914}', '\u{1D915}', '\u{1D916}', '\u{1D917}', '\u{1D918}', '\u{1D919}', '\u{1D91A}', + '\u{1D91B}', '\u{1D91C}', '\u{1D91D}', '\u{1D91E}', '\u{1D91F}', '\u{1D920}', '\u{1D921}', '\u{1D922}', + '\u{1D923}', '\u{1D924}', '\u{1D925}', '\u{1D926}', '\u{1D927}', '\u{1D928}', '\u{1D929}', '\u{1D92A}', + '\u{1D92B}', '\u{1D92C}', '\u{1D92D}', '\u{1D92E}', '\u{1D92F}', '\u{1D930}', '\u{1D931}', '\u{1D932}', + '\u{1D933}', '\u{1D934}', '\u{1D935}', '\u{1D936}', '\u{1D937}', '\u{1D938}', '\u{1D939}', '\u{1D93A}', + '\u{1D93B}', '\u{1D93C}', '\u{1D93D}', '\u{1D93E}', '\u{1D93F}', '\u{1D940}', '\u{1D941}', '\u{1D942}', + '\u{1D943}', '\u{1D944}', '\u{1D945}', '\u{1D946}', '\u{1D947}', '\u{1D948}', '\u{1D949}', '\u{1D94A}', + '\u{1D94B}', '\u{1D94C}', '\u{1D94D}', '\u{1D94E}', '\u{1D94F}', '\u{1D950}', '\u{1D951}', '\u{1D952}', + '\u{1D953}', '\u{1D954}', '\u{1D955}', '\u{1D956}', '\u{1D957}', '\u{1D958}', '\u{1D959}', '\u{1D95A}', + '\u{1D95B}', '\u{1D95C}', '\u{1D95D}', '\u{1D95E}', '\u{1D95F}', '\u{1D960}', '\u{1D961}', '\u{1D962}', + '\u{1D963}', '\u{1D964}', '\u{1D965}', '\u{1D966}', '\u{1D967}', '\u{1D968}', '\u{1D969}', '\u{1D96A}', + '\u{1D96B}', '\u{1D96C}', '\u{1D96D}', '\u{1D96E}', '\u{1D96F}', '\u{1D970}', '\u{1D971}', '\u{1D972}', + '\u{1D973}', '\u{1D974}', '\u{1D975}', '\u{1D976}', '\u{1D977}', '\u{1D978}', '\u{1D979}', '\u{1D97A}', + '\u{1D97B}', '\u{1D97C}', '\u{1D97D}', '\u{1D97E}', '\u{1D97F}', '\u{1D980}', '\u{1D981}', '\u{1D982}', + '\u{1D983}', '\u{1D984}', '\u{1D985}', '\u{1D986}', '\u{1D987}', '\u{1D988}', '\u{1D989}', '\u{1D98A}', + '\u{1D98B}', '\u{1D98C}', '\u{1D98D}', '\u{1D98E}', '\u{1D98F}', '\u{1D990}', '\u{1D991}', '\u{1D992}', + '\u{1D993}', '\u{1D994}', '\u{1D995}', '\u{1D996}', '\u{1D997}', '\u{1D998}', '\u{1D999}', '\u{1D99A}', + '\u{1D99B}', '\u{1D99C}', '\u{1D99D}', '\u{1D99E}', '\u{1D99F}', '\u{1D9A0}', '\u{1D9A1}', '\u{1D9A2}', + '\u{1D9A3}', '\u{1D9A4}', '\u{1D9A5}', '\u{1D9A6}', '\u{1D9A7}', '\u{1D9A8}', '\u{1D9A9}', '\u{1D9AA}', + '\u{1D9AB}', '\u{1D9AC}', '\u{1D9AD}', '\u{1D9AE}', '\u{1D9AF}', '\u{1D9B0}', '\u{1D9B1}', '\u{1D9B2}', + '\u{1D9B3}', '\u{1D9B4}', '\u{1D9B5}', '\u{1D9B6}', '\u{1D9B7}', '\u{1D9B8}', '\u{1D9B9}', '\u{1D9BA}', + '\u{1D9BB}', '\u{1D9BC}', '\u{1D9BD}', '\u{1D9BE}', '\u{1D9BF}', '\u{1D9C0}', '\u{1D9C1}', '\u{1D9C2}', + '\u{1D9C3}', '\u{1D9C4}', '\u{1D9C5}', '\u{1D9C6}', '\u{1D9C7}', '\u{1D9C8}', '\u{1D9C9}', '\u{1D9CA}', + '\u{1D9CB}', '\u{1D9CC}', '\u{1D9CD}', '\u{1D9CE}', '\u{1D9CF}', '\u{1D9D0}', '\u{1D9D1}', '\u{1D9D2}', + '\u{1D9D3}', '\u{1D9D4}', '\u{1D9D5}', '\u{1D9D6}', '\u{1D9D7}', '\u{1D9D8}', '\u{1D9D9}', '\u{1D9DA}', + '\u{1D9DB}', '\u{1D9DC}', '\u{1D9DD}', '\u{1D9DE}', '\u{1D9DF}', '\u{1D9E0}', '\u{1D9E1}', '\u{1D9E2}', + '\u{1D9E3}', '\u{1D9E4}', '\u{1D9E5}', '\u{1D9E6}', '\u{1D9E7}', '\u{1D9E8}', '\u{1D9E9}', '\u{1D9EA}', + '\u{1D9EB}', '\u{1D9EC}', '\u{1D9ED}', '\u{1D9EE}', '\u{1D9EF}', '\u{1D9F0}', '\u{1D9F1}', '\u{1D9F2}', + '\u{1D9F3}', '\u{1D9F4}', '\u{1D9F5}', '\u{1D9F6}', '\u{1D9F7}', '\u{1D9F8}', '\u{1D9F9}', '\u{1D9FA}', + '\u{1D9FB}', '\u{1D9FC}', '\u{1D9FD}', '\u{1D9FE}', '\u{1D9FF}', '\u{1DA37}', '\u{1DA38}', '\u{1DA39}', + '\u{1DA3A}', '\u{1DA6D}', '\u{1DA6E}', '\u{1DA6F}', '\u{1DA70}', '\u{1DA71}', '\u{1DA72}', '\u{1DA73}', + '\u{1DA74}', '\u{1DA76}', '\u{1DA77}', '\u{1DA78}', '\u{1DA79}', '\u{1DA7A}', '\u{1DA7B}', '\u{1DA7C}', + '\u{1DA7D}', '\u{1DA7E}', '\u{1DA7F}', '\u{1DA80}', '\u{1DA81}', '\u{1DA82}', '\u{1DA83}', '\u{1DA85}', + '\u{1DA86}', '\u{1E14F}', '\u{1ECAC}', '\u{1ED2E}', '\u{1F000}', '\u{1F001}', '\u{1F002}', '\u{1F003}', + '\u{1F004}', '\u{1F005}', '\u{1F006}', '\u{1F007}', '\u{1F008}', '\u{1F009}', '\u{1F00A}', '\u{1F00B}', + '\u{1F00C}', '\u{1F00D}', '\u{1F00E}', '\u{1F00F}', '\u{1F010}', '\u{1F011}', '\u{1F012}', '\u{1F013}', + '\u{1F014}', '\u{1F015}', '\u{1F016}', '\u{1F017}', '\u{1F018}', '\u{1F019}', '\u{1F01A}', '\u{1F01B}', + '\u{1F01C}', '\u{1F01D}', '\u{1F01E}', '\u{1F01F}', '\u{1F020}', '\u{1F021}', '\u{1F022}', '\u{1F023}', + '\u{1F024}', '\u{1F025}', '\u{1F026}', '\u{1F027}', '\u{1F028}', '\u{1F029}', '\u{1F02A}', '\u{1F02B}', + '\u{1F030}', '\u{1F031}', '\u{1F032}', '\u{1F033}', '\u{1F034}', '\u{1F035}', '\u{1F036}', '\u{1F037}', + '\u{1F038}', '\u{1F039}', '\u{1F03A}', '\u{1F03B}', '\u{1F03C}', '\u{1F03D}', '\u{1F03E}', '\u{1F03F}', + '\u{1F040}', '\u{1F041}', '\u{1F042}', '\u{1F043}', '\u{1F044}', '\u{1F045}', '\u{1F046}', '\u{1F047}', + '\u{1F048}', '\u{1F049}', '\u{1F04A}', '\u{1F04B}', '\u{1F04C}', '\u{1F04D}', '\u{1F04E}', '\u{1F04F}', + '\u{1F050}', '\u{1F051}', '\u{1F052}', '\u{1F053}', '\u{1F054}', '\u{1F055}', '\u{1F056}', '\u{1F057}', + '\u{1F058}', '\u{1F059}', '\u{1F05A}', '\u{1F05B}', '\u{1F05C}', '\u{1F05D}', '\u{1F05E}', '\u{1F05F}', + '\u{1F060}', '\u{1F061}', '\u{1F062}', '\u{1F063}', '\u{1F064}', '\u{1F065}', '\u{1F066}', '\u{1F067}', + '\u{1F068}', '\u{1F069}', '\u{1F06A}', '\u{1F06B}', '\u{1F06C}', '\u{1F06D}', '\u{1F06E}', '\u{1F06F}', + '\u{1F070}', '\u{1F071}', '\u{1F072}', '\u{1F073}', '\u{1F074}', '\u{1F075}', '\u{1F076}', '\u{1F077}', + '\u{1F078}', '\u{1F079}', '\u{1F07A}', '\u{1F07B}', '\u{1F07C}', '\u{1F07D}', '\u{1F07E}', '\u{1F07F}', + '\u{1F080}', '\u{1F081}', '\u{1F082}', '\u{1F083}', '\u{1F084}', '\u{1F085}', '\u{1F086}', '\u{1F087}', + '\u{1F088}', '\u{1F089}', '\u{1F08A}', '\u{1F08B}', '\u{1F08C}', '\u{1F08D}', '\u{1F08E}', '\u{1F08F}', + '\u{1F090}', '\u{1F091}', '\u{1F092}', '\u{1F093}', '\u{1F0A0}', '\u{1F0A1}', '\u{1F0A2}', '\u{1F0A3}', + '\u{1F0A4}', '\u{1F0A5}', '\u{1F0A6}', '\u{1F0A7}', '\u{1F0A8}', '\u{1F0A9}', '\u{1F0AA}', '\u{1F0AB}', + '\u{1F0AC}', '\u{1F0AD}', '\u{1F0AE}', '\u{1F0B1}', '\u{1F0B2}', '\u{1F0B3}', '\u{1F0B4}', '\u{1F0B5}', + '\u{1F0B6}', '\u{1F0B7}', '\u{1F0B8}', '\u{1F0B9}', '\u{1F0BA}', '\u{1F0BB}', '\u{1F0BC}', '\u{1F0BD}', + '\u{1F0BE}', '\u{1F0BF}', '\u{1F0C1}', '\u{1F0C2}', '\u{1F0C3}', '\u{1F0C4}', '\u{1F0C5}', '\u{1F0C6}', + '\u{1F0C7}', '\u{1F0C8}', '\u{1F0C9}', '\u{1F0CA}', '\u{1F0CB}', '\u{1F0CC}', '\u{1F0CD}', '\u{1F0CE}', + '\u{1F0CF}', '\u{1F0D1}', '\u{1F0D2}', '\u{1F0D3}', '\u{1F0D4}', '\u{1F0D5}', '\u{1F0D6}', '\u{1F0D7}', + '\u{1F0D8}', '\u{1F0D9}', '\u{1F0DA}', '\u{1F0DB}', '\u{1F0DC}', '\u{1F0DD}', '\u{1F0DE}', '\u{1F0DF}', + '\u{1F0E0}', '\u{1F0E1}', '\u{1F0E2}', '\u{1F0E3}', '\u{1F0E4}', '\u{1F0E5}', '\u{1F0E6}', '\u{1F0E7}', + '\u{1F0E8}', '\u{1F0E9}', '\u{1F0EA}', '\u{1F0EB}', '\u{1F0EC}', '\u{1F0ED}', '\u{1F0EE}', '\u{1F0EF}', + '\u{1F0F0}', '\u{1F0F1}', '\u{1F0F2}', '\u{1F0F3}', '\u{1F0F4}', '\u{1F0F5}', '\u{1F10D}', '\u{1F10E}', + '\u{1F10F}', '\u{1F110}', '\u{1F111}', '\u{1F112}', '\u{1F113}', '\u{1F114}', '\u{1F115}', '\u{1F116}', + '\u{1F117}', '\u{1F118}', '\u{1F119}', '\u{1F11A}', '\u{1F11B}', '\u{1F11C}', '\u{1F11D}', '\u{1F11E}', + '\u{1F11F}', '\u{1F120}', '\u{1F121}', '\u{1F122}', '\u{1F123}', '\u{1F124}', '\u{1F125}', '\u{1F126}', + '\u{1F127}', '\u{1F128}', '\u{1F129}', '\u{1F12A}', '\u{1F12B}', '\u{1F12C}', '\u{1F12D}', '\u{1F12E}', + '\u{1F12F}', '\u{1F130}', '\u{1F131}', '\u{1F132}', '\u{1F133}', '\u{1F134}', '\u{1F135}', '\u{1F136}', + '\u{1F137}', '\u{1F138}', '\u{1F139}', '\u{1F13A}', '\u{1F13B}', '\u{1F13C}', '\u{1F13D}', '\u{1F13E}', + '\u{1F13F}', '\u{1F140}', '\u{1F141}', '\u{1F142}', '\u{1F143}', '\u{1F144}', '\u{1F145}', '\u{1F146}', + '\u{1F147}', '\u{1F148}', '\u{1F149}', '\u{1F14A}', '\u{1F14B}', '\u{1F14C}', '\u{1F14D}', '\u{1F14E}', + '\u{1F14F}', '\u{1F150}', '\u{1F151}', '\u{1F152}', '\u{1F153}', '\u{1F154}', '\u{1F155}', '\u{1F156}', + '\u{1F157}', '\u{1F158}', '\u{1F159}', '\u{1F15A}', '\u{1F15B}', '\u{1F15C}', '\u{1F15D}', '\u{1F15E}', + '\u{1F15F}', '\u{1F160}', '\u{1F161}', '\u{1F162}', '\u{1F163}', '\u{1F164}', '\u{1F165}', '\u{1F166}', + '\u{1F167}', '\u{1F168}', '\u{1F169}', '\u{1F16A}', '\u{1F16B}', '\u{1F16C}', '\u{1F16D}', '\u{1F16E}', + '\u{1F16F}', '\u{1F170}', '\u{1F171}', '\u{1F172}', '\u{1F173}', '\u{1F174}', '\u{1F175}', '\u{1F176}', + '\u{1F177}', '\u{1F178}', '\u{1F179}', '\u{1F17A}', '\u{1F17B}', '\u{1F17C}', '\u{1F17D}', '\u{1F17E}', + '\u{1F17F}', '\u{1F180}', '\u{1F181}', '\u{1F182}', '\u{1F183}', '\u{1F184}', '\u{1F185}', '\u{1F186}', + '\u{1F187}', '\u{1F188}', '\u{1F189}', '\u{1F18A}', '\u{1F18B}', '\u{1F18C}', '\u{1F18D}', '\u{1F18E}', + '\u{1F18F}', '\u{1F190}', '\u{1F191}', '\u{1F192}', '\u{1F193}', '\u{1F194}', '\u{1F195}', '\u{1F196}', + '\u{1F197}', '\u{1F198}', '\u{1F199}', '\u{1F19A}', '\u{1F19B}', '\u{1F19C}', '\u{1F19D}', '\u{1F19E}', + '\u{1F19F}', '\u{1F1A0}', '\u{1F1A1}', '\u{1F1A2}', '\u{1F1A3}', '\u{1F1A4}', '\u{1F1A5}', '\u{1F1A6}', + '\u{1F1A7}', '\u{1F1A8}', '\u{1F1A9}', '\u{1F1AA}', '\u{1F1AB}', '\u{1F1AC}', '\u{1F1AD}', '\u{1F1E6}', + '\u{1F1E7}', '\u{1F1E8}', '\u{1F1E9}', '\u{1F1EA}', '\u{1F1EB}', '\u{1F1EC}', '\u{1F1ED}', '\u{1F1EE}', + '\u{1F1EF}', '\u{1F1F0}', '\u{1F1F1}', '\u{1F1F2}', '\u{1F1F3}', '\u{1F1F4}', '\u{1F1F5}', '\u{1F1F6}', + '\u{1F1F7}', '\u{1F1F8}', '\u{1F1F9}', '\u{1F1FA}', '\u{1F1FB}', '\u{1F1FC}', '\u{1F1FD}', '\u{1F1FE}', + '\u{1F1FF}', '\u{1F200}', '\u{1F201}', '\u{1F202}', '\u{1F210}', '\u{1F211}', '\u{1F212}', '\u{1F213}', + '\u{1F214}', '\u{1F215}', '\u{1F216}', '\u{1F217}', '\u{1F218}', '\u{1F219}', '\u{1F21A}', '\u{1F21B}', + '\u{1F21C}', '\u{1F21D}', '\u{1F21E}', '\u{1F21F}', '\u{1F220}', '\u{1F221}', '\u{1F222}', '\u{1F223}', + '\u{1F224}', '\u{1F225}', '\u{1F226}', '\u{1F227}', '\u{1F228}', '\u{1F229}', '\u{1F22A}', '\u{1F22B}', + '\u{1F22C}', '\u{1F22D}', '\u{1F22E}', '\u{1F22F}', '\u{1F230}', '\u{1F231}', '\u{1F232}', '\u{1F233}', + '\u{1F234}', '\u{1F235}', '\u{1F236}', '\u{1F237}', '\u{1F238}', '\u{1F239}', '\u{1F23A}', '\u{1F23B}', + '\u{1F240}', '\u{1F241}', '\u{1F242}', '\u{1F243}', '\u{1F244}', '\u{1F245}', '\u{1F246}', '\u{1F247}', + '\u{1F248}', '\u{1F250}', '\u{1F251}', '\u{1F260}', '\u{1F261}', '\u{1F262}', '\u{1F263}', '\u{1F264}', + '\u{1F265}', '\u{1F300}', '\u{1F301}', '\u{1F302}', '\u{1F303}', '\u{1F304}', '\u{1F305}', '\u{1F306}', + '\u{1F307}', '\u{1F308}', '\u{1F309}', '\u{1F30A}', '\u{1F30B}', '\u{1F30C}', '\u{1F30D}', '\u{1F30E}', + '\u{1F30F}', '\u{1F310}', '\u{1F311}', '\u{1F312}', '\u{1F313}', '\u{1F314}', '\u{1F315}', '\u{1F316}', + '\u{1F317}', '\u{1F318}', '\u{1F319}', '\u{1F31A}', '\u{1F31B}', '\u{1F31C}', '\u{1F31D}', '\u{1F31E}', + '\u{1F31F}', '\u{1F320}', '\u{1F321}', '\u{1F322}', '\u{1F323}', '\u{1F324}', '\u{1F325}', '\u{1F326}', + '\u{1F327}', '\u{1F328}', '\u{1F329}', '\u{1F32A}', '\u{1F32B}', '\u{1F32C}', '\u{1F32D}', '\u{1F32E}', + '\u{1F32F}', '\u{1F330}', '\u{1F331}', '\u{1F332}', '\u{1F333}', '\u{1F334}', '\u{1F335}', '\u{1F336}', + '\u{1F337}', '\u{1F338}', '\u{1F339}', '\u{1F33A}', '\u{1F33B}', '\u{1F33C}', '\u{1F33D}', '\u{1F33E}', + '\u{1F33F}', '\u{1F340}', '\u{1F341}', '\u{1F342}', '\u{1F343}', '\u{1F344}', '\u{1F345}', '\u{1F346}', + '\u{1F347}', '\u{1F348}', '\u{1F349}', '\u{1F34A}', '\u{1F34B}', '\u{1F34C}', '\u{1F34D}', '\u{1F34E}', + '\u{1F34F}', '\u{1F350}', '\u{1F351}', '\u{1F352}', '\u{1F353}', '\u{1F354}', '\u{1F355}', '\u{1F356}', + '\u{1F357}', '\u{1F358}', '\u{1F359}', '\u{1F35A}', '\u{1F35B}', '\u{1F35C}', '\u{1F35D}', '\u{1F35E}', + '\u{1F35F}', '\u{1F360}', '\u{1F361}', '\u{1F362}', '\u{1F363}', '\u{1F364}', '\u{1F365}', '\u{1F366}', + '\u{1F367}', '\u{1F368}', '\u{1F369}', '\u{1F36A}', '\u{1F36B}', '\u{1F36C}', '\u{1F36D}', '\u{1F36E}', + '\u{1F36F}', '\u{1F370}', '\u{1F371}', '\u{1F372}', '\u{1F373}', '\u{1F374}', '\u{1F375}', '\u{1F376}', + '\u{1F377}', '\u{1F378}', '\u{1F379}', '\u{1F37A}', '\u{1F37B}', '\u{1F37C}', '\u{1F37D}', '\u{1F37E}', + '\u{1F37F}', '\u{1F380}', '\u{1F381}', '\u{1F382}', '\u{1F383}', '\u{1F384}', '\u{1F385}', '\u{1F386}', + '\u{1F387}', '\u{1F388}', '\u{1F389}', '\u{1F38A}', '\u{1F38B}', '\u{1F38C}', '\u{1F38D}', '\u{1F38E}', + '\u{1F38F}', '\u{1F390}', '\u{1F391}', '\u{1F392}', '\u{1F393}', '\u{1F394}', '\u{1F395}', '\u{1F396}', + '\u{1F397}', '\u{1F398}', '\u{1F399}', '\u{1F39A}', '\u{1F39B}', '\u{1F39C}', '\u{1F39D}', '\u{1F39E}', + '\u{1F39F}', '\u{1F3A0}', '\u{1F3A1}', '\u{1F3A2}', '\u{1F3A3}', '\u{1F3A4}', '\u{1F3A5}', '\u{1F3A6}', + '\u{1F3A7}', '\u{1F3A8}', '\u{1F3A9}', '\u{1F3AA}', '\u{1F3AB}', '\u{1F3AC}', '\u{1F3AD}', '\u{1F3AE}', + '\u{1F3AF}', '\u{1F3B0}', '\u{1F3B1}', '\u{1F3B2}', '\u{1F3B3}', '\u{1F3B4}', '\u{1F3B5}', '\u{1F3B6}', + '\u{1F3B7}', '\u{1F3B8}', '\u{1F3B9}', '\u{1F3BA}', '\u{1F3BB}', '\u{1F3BC}', '\u{1F3BD}', '\u{1F3BE}', + '\u{1F3BF}', '\u{1F3C0}', '\u{1F3C1}', '\u{1F3C2}', '\u{1F3C3}', '\u{1F3C4}', '\u{1F3C5}', '\u{1F3C6}', + '\u{1F3C7}', '\u{1F3C8}', '\u{1F3C9}', '\u{1F3CA}', '\u{1F3CB}', '\u{1F3CC}', '\u{1F3CD}', '\u{1F3CE}', + '\u{1F3CF}', '\u{1F3D0}', '\u{1F3D1}', '\u{1F3D2}', '\u{1F3D3}', '\u{1F3D4}', '\u{1F3D5}', '\u{1F3D6}', + '\u{1F3D7}', '\u{1F3D8}', '\u{1F3D9}', '\u{1F3DA}', '\u{1F3DB}', '\u{1F3DC}', '\u{1F3DD}', '\u{1F3DE}', + '\u{1F3DF}', '\u{1F3E0}', '\u{1F3E1}', '\u{1F3E2}', '\u{1F3E3}', '\u{1F3E4}', '\u{1F3E5}', '\u{1F3E6}', + '\u{1F3E7}', '\u{1F3E8}', '\u{1F3E9}', '\u{1F3EA}', '\u{1F3EB}', '\u{1F3EC}', '\u{1F3ED}', '\u{1F3EE}', + '\u{1F3EF}', '\u{1F3F0}', '\u{1F3F1}', '\u{1F3F2}', '\u{1F3F3}', '\u{1F3F4}', '\u{1F3F5}', '\u{1F3F6}', + '\u{1F3F7}', '\u{1F3F8}', '\u{1F3F9}', '\u{1F3FA}', '\u{1F400}', '\u{1F401}', '\u{1F402}', '\u{1F403}', + '\u{1F404}', '\u{1F405}', '\u{1F406}', '\u{1F407}', '\u{1F408}', '\u{1F409}', '\u{1F40A}', '\u{1F40B}', + '\u{1F40C}', '\u{1F40D}', '\u{1F40E}', '\u{1F40F}', '\u{1F410}', '\u{1F411}', '\u{1F412}', '\u{1F413}', + '\u{1F414}', '\u{1F415}', '\u{1F416}', '\u{1F417}', '\u{1F418}', '\u{1F419}', '\u{1F41A}', '\u{1F41B}', + '\u{1F41C}', '\u{1F41D}', '\u{1F41E}', '\u{1F41F}', '\u{1F420}', '\u{1F421}', '\u{1F422}', '\u{1F423}', + '\u{1F424}', '\u{1F425}', '\u{1F426}', '\u{1F427}', '\u{1F428}', '\u{1F429}', '\u{1F42A}', '\u{1F42B}', + '\u{1F42C}', '\u{1F42D}', '\u{1F42E}', '\u{1F42F}', '\u{1F430}', '\u{1F431}', '\u{1F432}', '\u{1F433}', + '\u{1F434}', '\u{1F435}', '\u{1F436}', '\u{1F437}', '\u{1F438}', '\u{1F439}', '\u{1F43A}', '\u{1F43B}', + '\u{1F43C}', '\u{1F43D}', '\u{1F43E}', '\u{1F43F}', '\u{1F440}', '\u{1F441}', '\u{1F442}', '\u{1F443}', + '\u{1F444}', '\u{1F445}', '\u{1F446}', '\u{1F447}', '\u{1F448}', '\u{1F449}', '\u{1F44A}', '\u{1F44B}', + '\u{1F44C}', '\u{1F44D}', '\u{1F44E}', '\u{1F44F}', '\u{1F450}', '\u{1F451}', '\u{1F452}', '\u{1F453}', + '\u{1F454}', '\u{1F455}', '\u{1F456}', '\u{1F457}', '\u{1F458}', '\u{1F459}', '\u{1F45A}', '\u{1F45B}', + '\u{1F45C}', '\u{1F45D}', '\u{1F45E}', '\u{1F45F}', '\u{1F460}', '\u{1F461}', '\u{1F462}', '\u{1F463}', + '\u{1F464}', '\u{1F465}', '\u{1F466}', '\u{1F467}', '\u{1F468}', '\u{1F469}', '\u{1F46A}', '\u{1F46B}', + '\u{1F46C}', '\u{1F46D}', '\u{1F46E}', '\u{1F46F}', '\u{1F470}', '\u{1F471}', '\u{1F472}', '\u{1F473}', + '\u{1F474}', '\u{1F475}', '\u{1F476}', '\u{1F477}', '\u{1F478}', '\u{1F479}', '\u{1F47A}', '\u{1F47B}', + '\u{1F47C}', '\u{1F47D}', '\u{1F47E}', '\u{1F47F}', '\u{1F480}', '\u{1F481}', '\u{1F482}', '\u{1F483}', + '\u{1F484}', '\u{1F485}', '\u{1F486}', '\u{1F487}', '\u{1F488}', '\u{1F489}', '\u{1F48A}', '\u{1F48B}', + '\u{1F48C}', '\u{1F48D}', '\u{1F48E}', '\u{1F48F}', '\u{1F490}', '\u{1F491}', '\u{1F492}', '\u{1F493}', + '\u{1F494}', '\u{1F495}', '\u{1F496}', '\u{1F497}', '\u{1F498}', '\u{1F499}', '\u{1F49A}', '\u{1F49B}', + '\u{1F49C}', '\u{1F49D}', '\u{1F49E}', '\u{1F49F}', '\u{1F4A0}', '\u{1F4A1}', '\u{1F4A2}', '\u{1F4A3}', + '\u{1F4A4}', '\u{1F4A5}', '\u{1F4A6}', '\u{1F4A7}', '\u{1F4A8}', '\u{1F4A9}', '\u{1F4AA}', '\u{1F4AB}', + '\u{1F4AC}', '\u{1F4AD}', '\u{1F4AE}', '\u{1F4AF}', '\u{1F4B0}', '\u{1F4B1}', '\u{1F4B2}', '\u{1F4B3}', + '\u{1F4B4}', '\u{1F4B5}', '\u{1F4B6}', '\u{1F4B7}', '\u{1F4B8}', '\u{1F4B9}', '\u{1F4BA}', '\u{1F4BB}', + '\u{1F4BC}', '\u{1F4BD}', '\u{1F4BE}', '\u{1F4BF}', '\u{1F4C0}', '\u{1F4C1}', '\u{1F4C2}', '\u{1F4C3}', + '\u{1F4C4}', '\u{1F4C5}', '\u{1F4C6}', '\u{1F4C7}', '\u{1F4C8}', '\u{1F4C9}', '\u{1F4CA}', '\u{1F4CB}', + '\u{1F4CC}', '\u{1F4CD}', '\u{1F4CE}', '\u{1F4CF}', '\u{1F4D0}', '\u{1F4D1}', '\u{1F4D2}', '\u{1F4D3}', + '\u{1F4D4}', '\u{1F4D5}', '\u{1F4D6}', '\u{1F4D7}', '\u{1F4D8}', '\u{1F4D9}', '\u{1F4DA}', '\u{1F4DB}', + '\u{1F4DC}', '\u{1F4DD}', '\u{1F4DE}', '\u{1F4DF}', '\u{1F4E0}', '\u{1F4E1}', '\u{1F4E2}', '\u{1F4E3}', + '\u{1F4E4}', '\u{1F4E5}', '\u{1F4E6}', '\u{1F4E7}', '\u{1F4E8}', '\u{1F4E9}', '\u{1F4EA}', '\u{1F4EB}', + '\u{1F4EC}', '\u{1F4ED}', '\u{1F4EE}', '\u{1F4EF}', '\u{1F4F0}', '\u{1F4F1}', '\u{1F4F2}', '\u{1F4F3}', + '\u{1F4F4}', '\u{1F4F5}', '\u{1F4F6}', '\u{1F4F7}', '\u{1F4F8}', '\u{1F4F9}', '\u{1F4FA}', '\u{1F4FB}', + '\u{1F4FC}', '\u{1F4FD}', '\u{1F4FE}', '\u{1F4FF}', '\u{1F500}', '\u{1F501}', '\u{1F502}', '\u{1F503}', + '\u{1F504}', '\u{1F505}', '\u{1F506}', '\u{1F507}', '\u{1F508}', '\u{1F509}', '\u{1F50A}', '\u{1F50B}', + '\u{1F50C}', '\u{1F50D}', '\u{1F50E}', '\u{1F50F}', '\u{1F510}', '\u{1F511}', '\u{1F512}', '\u{1F513}', + '\u{1F514}', '\u{1F515}', '\u{1F516}', '\u{1F517}', '\u{1F518}', '\u{1F519}', '\u{1F51A}', '\u{1F51B}', + '\u{1F51C}', '\u{1F51D}', '\u{1F51E}', '\u{1F51F}', '\u{1F520}', '\u{1F521}', '\u{1F522}', '\u{1F523}', + '\u{1F524}', '\u{1F525}', '\u{1F526}', '\u{1F527}', '\u{1F528}', '\u{1F529}', '\u{1F52A}', '\u{1F52B}', + '\u{1F52C}', '\u{1F52D}', '\u{1F52E}', '\u{1F52F}', '\u{1F530}', '\u{1F531}', '\u{1F532}', '\u{1F533}', + '\u{1F534}', '\u{1F535}', '\u{1F536}', '\u{1F537}', '\u{1F538}', '\u{1F539}', '\u{1F53A}', '\u{1F53B}', + '\u{1F53C}', '\u{1F53D}', '\u{1F53E}', '\u{1F53F}', '\u{1F540}', '\u{1F541}', '\u{1F542}', '\u{1F543}', + '\u{1F544}', '\u{1F545}', '\u{1F546}', '\u{1F547}', '\u{1F548}', '\u{1F549}', '\u{1F54A}', '\u{1F54B}', + '\u{1F54C}', '\u{1F54D}', '\u{1F54E}', '\u{1F54F}', '\u{1F550}', '\u{1F551}', '\u{1F552}', '\u{1F553}', + '\u{1F554}', '\u{1F555}', '\u{1F556}', '\u{1F557}', '\u{1F558}', '\u{1F559}', '\u{1F55A}', '\u{1F55B}', + '\u{1F55C}', '\u{1F55D}', '\u{1F55E}', '\u{1F55F}', '\u{1F560}', '\u{1F561}', '\u{1F562}', '\u{1F563}', + '\u{1F564}', '\u{1F565}', '\u{1F566}', '\u{1F567}', '\u{1F568}', '\u{1F569}', '\u{1F56A}', '\u{1F56B}', + '\u{1F56C}', '\u{1F56D}', '\u{1F56E}', '\u{1F56F}', '\u{1F570}', '\u{1F571}', '\u{1F572}', '\u{1F573}', + '\u{1F574}', '\u{1F575}', '\u{1F576}', '\u{1F577}', '\u{1F578}', '\u{1F579}', '\u{1F57A}', '\u{1F57B}', + '\u{1F57C}', '\u{1F57D}', '\u{1F57E}', '\u{1F57F}', '\u{1F580}', '\u{1F581}', '\u{1F582}', '\u{1F583}', + '\u{1F584}', '\u{1F585}', '\u{1F586}', '\u{1F587}', '\u{1F588}', '\u{1F589}', '\u{1F58A}', '\u{1F58B}', + '\u{1F58C}', '\u{1F58D}', '\u{1F58E}', '\u{1F58F}', '\u{1F590}', '\u{1F591}', '\u{1F592}', '\u{1F593}', + '\u{1F594}', '\u{1F595}', '\u{1F596}', '\u{1F597}', '\u{1F598}', '\u{1F599}', '\u{1F59A}', '\u{1F59B}', + '\u{1F59C}', '\u{1F59D}', '\u{1F59E}', '\u{1F59F}', '\u{1F5A0}', '\u{1F5A1}', '\u{1F5A2}', '\u{1F5A3}', + '\u{1F5A4}', '\u{1F5A5}', '\u{1F5A6}', '\u{1F5A7}', '\u{1F5A8}', '\u{1F5A9}', '\u{1F5AA}', '\u{1F5AB}', + '\u{1F5AC}', '\u{1F5AD}', '\u{1F5AE}', '\u{1F5AF}', '\u{1F5B0}', '\u{1F5B1}', '\u{1F5B2}', '\u{1F5B3}', + '\u{1F5B4}', '\u{1F5B5}', '\u{1F5B6}', '\u{1F5B7}', '\u{1F5B8}', '\u{1F5B9}', '\u{1F5BA}', '\u{1F5BB}', + '\u{1F5BC}', '\u{1F5BD}', '\u{1F5BE}', '\u{1F5BF}', '\u{1F5C0}', '\u{1F5C1}', '\u{1F5C2}', '\u{1F5C3}', + '\u{1F5C4}', '\u{1F5C5}', '\u{1F5C6}', '\u{1F5C7}', '\u{1F5C8}', '\u{1F5C9}', '\u{1F5CA}', '\u{1F5CB}', + '\u{1F5CC}', '\u{1F5CD}', '\u{1F5CE}', '\u{1F5CF}', '\u{1F5D0}', '\u{1F5D1}', '\u{1F5D2}', '\u{1F5D3}', + '\u{1F5D4}', '\u{1F5D5}', '\u{1F5D6}', '\u{1F5D7}', '\u{1F5D8}', '\u{1F5D9}', '\u{1F5DA}', '\u{1F5DB}', + '\u{1F5DC}', '\u{1F5DD}', '\u{1F5DE}', '\u{1F5DF}', '\u{1F5E0}', '\u{1F5E1}', '\u{1F5E2}', '\u{1F5E3}', + '\u{1F5E4}', '\u{1F5E5}', '\u{1F5E6}', '\u{1F5E7}', '\u{1F5E8}', '\u{1F5E9}', '\u{1F5EA}', '\u{1F5EB}', + '\u{1F5EC}', '\u{1F5ED}', '\u{1F5EE}', '\u{1F5EF}', '\u{1F5F0}', '\u{1F5F1}', '\u{1F5F2}', '\u{1F5F3}', + '\u{1F5F4}', '\u{1F5F5}', '\u{1F5F6}', '\u{1F5F7}', '\u{1F5F8}', '\u{1F5F9}', '\u{1F5FA}', '\u{1F5FB}', + '\u{1F5FC}', '\u{1F5FD}', '\u{1F5FE}', '\u{1F5FF}', '\u{1F600}', '\u{1F601}', '\u{1F602}', '\u{1F603}', + '\u{1F604}', '\u{1F605}', '\u{1F606}', '\u{1F607}', '\u{1F608}', '\u{1F609}', '\u{1F60A}', '\u{1F60B}', + '\u{1F60C}', '\u{1F60D}', '\u{1F60E}', '\u{1F60F}', '\u{1F610}', '\u{1F611}', '\u{1F612}', '\u{1F613}', + '\u{1F614}', '\u{1F615}', '\u{1F616}', '\u{1F617}', '\u{1F618}', '\u{1F619}', '\u{1F61A}', '\u{1F61B}', + '\u{1F61C}', '\u{1F61D}', '\u{1F61E}', '\u{1F61F}', '\u{1F620}', '\u{1F621}', '\u{1F622}', '\u{1F623}', + '\u{1F624}', '\u{1F625}', '\u{1F626}', '\u{1F627}', '\u{1F628}', '\u{1F629}', '\u{1F62A}', '\u{1F62B}', + '\u{1F62C}', '\u{1F62D}', '\u{1F62E}', '\u{1F62F}', '\u{1F630}', '\u{1F631}', '\u{1F632}', '\u{1F633}', + '\u{1F634}', '\u{1F635}', '\u{1F636}', '\u{1F637}', '\u{1F638}', '\u{1F639}', '\u{1F63A}', '\u{1F63B}', + '\u{1F63C}', '\u{1F63D}', '\u{1F63E}', '\u{1F63F}', '\u{1F640}', '\u{1F641}', '\u{1F642}', '\u{1F643}', + '\u{1F644}', '\u{1F645}', '\u{1F646}', '\u{1F647}', '\u{1F648}', '\u{1F649}', '\u{1F64A}', '\u{1F64B}', + '\u{1F64C}', '\u{1F64D}', '\u{1F64E}', '\u{1F64F}', '\u{1F650}', '\u{1F651}', '\u{1F652}', '\u{1F653}', + '\u{1F654}', '\u{1F655}', '\u{1F656}', '\u{1F657}', '\u{1F658}', '\u{1F659}', '\u{1F65A}', '\u{1F65B}', + '\u{1F65C}', '\u{1F65D}', '\u{1F65E}', '\u{1F65F}', '\u{1F660}', '\u{1F661}', '\u{1F662}', '\u{1F663}', + '\u{1F664}', '\u{1F665}', '\u{1F666}', '\u{1F667}', '\u{1F668}', '\u{1F669}', '\u{1F66A}', '\u{1F66B}', + '\u{1F66C}', '\u{1F66D}', '\u{1F66E}', '\u{1F66F}', '\u{1F670}', '\u{1F671}', '\u{1F672}', '\u{1F673}', + '\u{1F674}', '\u{1F675}', '\u{1F676}', '\u{1F677}', '\u{1F678}', '\u{1F679}', '\u{1F67A}', '\u{1F67B}', + '\u{1F67C}', '\u{1F67D}', '\u{1F67E}', '\u{1F67F}', '\u{1F680}', '\u{1F681}', '\u{1F682}', '\u{1F683}', + '\u{1F684}', '\u{1F685}', '\u{1F686}', '\u{1F687}', '\u{1F688}', '\u{1F689}', '\u{1F68A}', '\u{1F68B}', + '\u{1F68C}', '\u{1F68D}', '\u{1F68E}', '\u{1F68F}', '\u{1F690}', '\u{1F691}', '\u{1F692}', '\u{1F693}', + '\u{1F694}', '\u{1F695}', '\u{1F696}', '\u{1F697}', '\u{1F698}', '\u{1F699}', '\u{1F69A}', '\u{1F69B}', + '\u{1F69C}', '\u{1F69D}', '\u{1F69E}', '\u{1F69F}', '\u{1F6A0}', '\u{1F6A1}', '\u{1F6A2}', '\u{1F6A3}', + '\u{1F6A4}', '\u{1F6A5}', '\u{1F6A6}', '\u{1F6A7}', '\u{1F6A8}', '\u{1F6A9}', '\u{1F6AA}', '\u{1F6AB}', + '\u{1F6AC}', '\u{1F6AD}', '\u{1F6AE}', '\u{1F6AF}', '\u{1F6B0}', '\u{1F6B1}', '\u{1F6B2}', '\u{1F6B3}', + '\u{1F6B4}', '\u{1F6B5}', '\u{1F6B6}', '\u{1F6B7}', '\u{1F6B8}', '\u{1F6B9}', '\u{1F6BA}', '\u{1F6BB}', + '\u{1F6BC}', '\u{1F6BD}', '\u{1F6BE}', '\u{1F6BF}', '\u{1F6C0}', '\u{1F6C1}', '\u{1F6C2}', '\u{1F6C3}', + '\u{1F6C4}', '\u{1F6C5}', '\u{1F6C6}', '\u{1F6C7}', '\u{1F6C8}', '\u{1F6C9}', '\u{1F6CA}', '\u{1F6CB}', + '\u{1F6CC}', '\u{1F6CD}', '\u{1F6CE}', '\u{1F6CF}', '\u{1F6D0}', '\u{1F6D1}', '\u{1F6D2}', '\u{1F6D3}', + '\u{1F6D4}', '\u{1F6D5}', '\u{1F6D6}', '\u{1F6D7}', '\u{1F6DD}', '\u{1F6DE}', '\u{1F6DF}', '\u{1F6E0}', + '\u{1F6E1}', '\u{1F6E2}', '\u{1F6E3}', '\u{1F6E4}', '\u{1F6E5}', '\u{1F6E6}', '\u{1F6E7}', '\u{1F6E8}', + '\u{1F6E9}', '\u{1F6EA}', '\u{1F6EB}', '\u{1F6EC}', '\u{1F6F0}', '\u{1F6F1}', '\u{1F6F2}', '\u{1F6F3}', + '\u{1F6F4}', '\u{1F6F5}', '\u{1F6F6}', '\u{1F6F7}', '\u{1F6F8}', '\u{1F6F9}', '\u{1F6FA}', '\u{1F6FB}', + '\u{1F6FC}', '\u{1F700}', '\u{1F701}', '\u{1F702}', '\u{1F703}', '\u{1F704}', '\u{1F705}', '\u{1F706}', + '\u{1F707}', '\u{1F708}', '\u{1F709}', '\u{1F70A}', '\u{1F70B}', '\u{1F70C}', '\u{1F70D}', '\u{1F70E}', + '\u{1F70F}', '\u{1F710}', '\u{1F711}', '\u{1F712}', '\u{1F713}', '\u{1F714}', '\u{1F715}', '\u{1F716}', + '\u{1F717}', '\u{1F718}', '\u{1F719}', '\u{1F71A}', '\u{1F71B}', '\u{1F71C}', '\u{1F71D}', '\u{1F71E}', + '\u{1F71F}', '\u{1F720}', '\u{1F721}', '\u{1F722}', '\u{1F723}', '\u{1F724}', '\u{1F725}', '\u{1F726}', + '\u{1F727}', '\u{1F728}', '\u{1F729}', '\u{1F72A}', '\u{1F72B}', '\u{1F72C}', '\u{1F72D}', '\u{1F72E}', + '\u{1F72F}', '\u{1F730}', '\u{1F731}', '\u{1F732}', '\u{1F733}', '\u{1F734}', '\u{1F735}', '\u{1F736}', + '\u{1F737}', '\u{1F738}', '\u{1F739}', '\u{1F73A}', '\u{1F73B}', '\u{1F73C}', '\u{1F73D}', '\u{1F73E}', + '\u{1F73F}', '\u{1F740}', '\u{1F741}', '\u{1F742}', '\u{1F743}', '\u{1F744}', '\u{1F745}', '\u{1F746}', + '\u{1F747}', '\u{1F748}', '\u{1F749}', '\u{1F74A}', '\u{1F74B}', '\u{1F74C}', '\u{1F74D}', '\u{1F74E}', + '\u{1F74F}', '\u{1F750}', '\u{1F751}', '\u{1F752}', '\u{1F753}', '\u{1F754}', '\u{1F755}', '\u{1F756}', + '\u{1F757}', '\u{1F758}', '\u{1F759}', '\u{1F75A}', '\u{1F75B}', '\u{1F75C}', '\u{1F75D}', '\u{1F75E}', + '\u{1F75F}', '\u{1F760}', '\u{1F761}', '\u{1F762}', '\u{1F763}', '\u{1F764}', '\u{1F765}', '\u{1F766}', + '\u{1F767}', '\u{1F768}', '\u{1F769}', '\u{1F76A}', '\u{1F76B}', '\u{1F76C}', '\u{1F76D}', '\u{1F76E}', + '\u{1F76F}', '\u{1F770}', '\u{1F771}', '\u{1F772}', '\u{1F773}', '\u{1F780}', '\u{1F781}', '\u{1F782}', + '\u{1F783}', '\u{1F784}', '\u{1F785}', '\u{1F786}', '\u{1F787}', '\u{1F788}', '\u{1F789}', '\u{1F78A}', + '\u{1F78B}', '\u{1F78C}', '\u{1F78D}', '\u{1F78E}', '\u{1F78F}', '\u{1F790}', '\u{1F791}', '\u{1F792}', + '\u{1F793}', '\u{1F794}', '\u{1F795}', '\u{1F796}', '\u{1F797}', '\u{1F798}', '\u{1F799}', '\u{1F79A}', + '\u{1F79B}', '\u{1F79C}', '\u{1F79D}', '\u{1F79E}', '\u{1F79F}', '\u{1F7A0}', '\u{1F7A1}', '\u{1F7A2}', + '\u{1F7A3}', '\u{1F7A4}', '\u{1F7A5}', '\u{1F7A6}', '\u{1F7A7}', '\u{1F7A8}', '\u{1F7A9}', '\u{1F7AA}', + '\u{1F7AB}', '\u{1F7AC}', '\u{1F7AD}', '\u{1F7AE}', '\u{1F7AF}', '\u{1F7B0}', '\u{1F7B1}', '\u{1F7B2}', + '\u{1F7B3}', '\u{1F7B4}', '\u{1F7B5}', '\u{1F7B6}', '\u{1F7B7}', '\u{1F7B8}', '\u{1F7B9}', '\u{1F7BA}', + '\u{1F7BB}', '\u{1F7BC}', '\u{1F7BD}', '\u{1F7BE}', '\u{1F7BF}', '\u{1F7C0}', '\u{1F7C1}', '\u{1F7C2}', + '\u{1F7C3}', '\u{1F7C4}', '\u{1F7C5}', '\u{1F7C6}', '\u{1F7C7}', '\u{1F7C8}', '\u{1F7C9}', '\u{1F7CA}', + '\u{1F7CB}', '\u{1F7CC}', '\u{1F7CD}', '\u{1F7CE}', '\u{1F7CF}', '\u{1F7D0}', '\u{1F7D1}', '\u{1F7D2}', + '\u{1F7D3}', '\u{1F7D4}', '\u{1F7D5}', '\u{1F7D6}', '\u{1F7D7}', '\u{1F7D8}', '\u{1F7E0}', '\u{1F7E1}', + '\u{1F7E2}', '\u{1F7E3}', '\u{1F7E4}', '\u{1F7E5}', '\u{1F7E6}', '\u{1F7E7}', '\u{1F7E8}', '\u{1F7E9}', + '\u{1F7EA}', '\u{1F7EB}', '\u{1F7F0}', '\u{1F800}', '\u{1F801}', '\u{1F802}', '\u{1F803}', '\u{1F804}', + '\u{1F805}', '\u{1F806}', '\u{1F807}', '\u{1F808}', '\u{1F809}', '\u{1F80A}', '\u{1F80B}', '\u{1F810}', + '\u{1F811}', '\u{1F812}', '\u{1F813}', '\u{1F814}', '\u{1F815}', '\u{1F816}', '\u{1F817}', '\u{1F818}', + '\u{1F819}', '\u{1F81A}', '\u{1F81B}', '\u{1F81C}', '\u{1F81D}', '\u{1F81E}', '\u{1F81F}', '\u{1F820}', + '\u{1F821}', '\u{1F822}', '\u{1F823}', '\u{1F824}', '\u{1F825}', '\u{1F826}', '\u{1F827}', '\u{1F828}', + '\u{1F829}', '\u{1F82A}', '\u{1F82B}', '\u{1F82C}', '\u{1F82D}', '\u{1F82E}', '\u{1F82F}', '\u{1F830}', + '\u{1F831}', '\u{1F832}', '\u{1F833}', '\u{1F834}', '\u{1F835}', '\u{1F836}', '\u{1F837}', '\u{1F838}', + '\u{1F839}', '\u{1F83A}', '\u{1F83B}', '\u{1F83C}', '\u{1F83D}', '\u{1F83E}', '\u{1F83F}', '\u{1F840}', + '\u{1F841}', '\u{1F842}', '\u{1F843}', '\u{1F844}', '\u{1F845}', '\u{1F846}', '\u{1F847}', '\u{1F850}', + '\u{1F851}', '\u{1F852}', '\u{1F853}', '\u{1F854}', '\u{1F855}', '\u{1F856}', '\u{1F857}', '\u{1F858}', + '\u{1F859}', '\u{1F860}', '\u{1F861}', '\u{1F862}', '\u{1F863}', '\u{1F864}', '\u{1F865}', '\u{1F866}', + '\u{1F867}', '\u{1F868}', '\u{1F869}', '\u{1F86A}', '\u{1F86B}', '\u{1F86C}', '\u{1F86D}', '\u{1F86E}', + '\u{1F86F}', '\u{1F870}', '\u{1F871}', '\u{1F872}', '\u{1F873}', '\u{1F874}', '\u{1F875}', '\u{1F876}', + '\u{1F877}', '\u{1F878}', '\u{1F879}', '\u{1F87A}', '\u{1F87B}', '\u{1F87C}', '\u{1F87D}', '\u{1F87E}', + '\u{1F87F}', '\u{1F880}', '\u{1F881}', '\u{1F882}', '\u{1F883}', '\u{1F884}', '\u{1F885}', '\u{1F886}', + '\u{1F887}', '\u{1F890}', '\u{1F891}', '\u{1F892}', '\u{1F893}', '\u{1F894}', '\u{1F895}', '\u{1F896}', + '\u{1F897}', '\u{1F898}', '\u{1F899}', '\u{1F89A}', '\u{1F89B}', '\u{1F89C}', '\u{1F89D}', '\u{1F89E}', + '\u{1F89F}', '\u{1F8A0}', '\u{1F8A1}', '\u{1F8A2}', '\u{1F8A3}', '\u{1F8A4}', '\u{1F8A5}', '\u{1F8A6}', + '\u{1F8A7}', '\u{1F8A8}', '\u{1F8A9}', '\u{1F8AA}', '\u{1F8AB}', '\u{1F8AC}', '\u{1F8AD}', '\u{1F8B0}', + '\u{1F8B1}', '\u{1F900}', '\u{1F901}', '\u{1F902}', '\u{1F903}', '\u{1F904}', '\u{1F905}', '\u{1F906}', + '\u{1F907}', '\u{1F908}', '\u{1F909}', '\u{1F90A}', '\u{1F90B}', '\u{1F90C}', '\u{1F90D}', '\u{1F90E}', + '\u{1F90F}', '\u{1F910}', '\u{1F911}', '\u{1F912}', '\u{1F913}', '\u{1F914}', '\u{1F915}', '\u{1F916}', + '\u{1F917}', '\u{1F918}', '\u{1F919}', '\u{1F91A}', '\u{1F91B}', '\u{1F91C}', '\u{1F91D}', '\u{1F91E}', + '\u{1F91F}', '\u{1F920}', '\u{1F921}', '\u{1F922}', '\u{1F923}', '\u{1F924}', '\u{1F925}', '\u{1F926}', + '\u{1F927}', '\u{1F928}', '\u{1F929}', '\u{1F92A}', '\u{1F92B}', '\u{1F92C}', '\u{1F92D}', '\u{1F92E}', + '\u{1F92F}', '\u{1F930}', '\u{1F931}', '\u{1F932}', '\u{1F933}', '\u{1F934}', '\u{1F935}', '\u{1F936}', + '\u{1F937}', '\u{1F938}', '\u{1F939}', '\u{1F93A}', '\u{1F93B}', '\u{1F93C}', '\u{1F93D}', '\u{1F93E}', + '\u{1F93F}', '\u{1F940}', '\u{1F941}', '\u{1F942}', '\u{1F943}', '\u{1F944}', '\u{1F945}', '\u{1F946}', + '\u{1F947}', '\u{1F948}', '\u{1F949}', '\u{1F94A}', '\u{1F94B}', '\u{1F94C}', '\u{1F94D}', '\u{1F94E}', + '\u{1F94F}', '\u{1F950}', '\u{1F951}', '\u{1F952}', '\u{1F953}', '\u{1F954}', '\u{1F955}', '\u{1F956}', + '\u{1F957}', '\u{1F958}', '\u{1F959}', '\u{1F95A}', '\u{1F95B}', '\u{1F95C}', '\u{1F95D}', '\u{1F95E}', + '\u{1F95F}', '\u{1F960}', '\u{1F961}', '\u{1F962}', '\u{1F963}', '\u{1F964}', '\u{1F965}', '\u{1F966}', + '\u{1F967}', '\u{1F968}', '\u{1F969}', '\u{1F96A}', '\u{1F96B}', '\u{1F96C}', '\u{1F96D}', '\u{1F96E}', + '\u{1F96F}', '\u{1F970}', '\u{1F971}', '\u{1F972}', '\u{1F973}', '\u{1F974}', '\u{1F975}', '\u{1F976}', + '\u{1F977}', '\u{1F978}', '\u{1F979}', '\u{1F97A}', '\u{1F97B}', '\u{1F97C}', '\u{1F97D}', '\u{1F97E}', + '\u{1F97F}', '\u{1F980}', '\u{1F981}', '\u{1F982}', '\u{1F983}', '\u{1F984}', '\u{1F985}', '\u{1F986}', + '\u{1F987}', '\u{1F988}', '\u{1F989}', '\u{1F98A}', '\u{1F98B}', '\u{1F98C}', '\u{1F98D}', '\u{1F98E}', + '\u{1F98F}', '\u{1F990}', '\u{1F991}', '\u{1F992}', '\u{1F993}', '\u{1F994}', '\u{1F995}', '\u{1F996}', + '\u{1F997}', '\u{1F998}', '\u{1F999}', '\u{1F99A}', '\u{1F99B}', '\u{1F99C}', '\u{1F99D}', '\u{1F99E}', + '\u{1F99F}', '\u{1F9A0}', '\u{1F9A1}', '\u{1F9A2}', '\u{1F9A3}', '\u{1F9A4}', '\u{1F9A5}', '\u{1F9A6}', + '\u{1F9A7}', '\u{1F9A8}', '\u{1F9A9}', '\u{1F9AA}', '\u{1F9AB}', '\u{1F9AC}', '\u{1F9AD}', '\u{1F9AE}', + '\u{1F9AF}', '\u{1F9B0}', '\u{1F9B1}', '\u{1F9B2}', '\u{1F9B3}', '\u{1F9B4}', '\u{1F9B5}', '\u{1F9B6}', + '\u{1F9B7}', '\u{1F9B8}', '\u{1F9B9}', '\u{1F9BA}', '\u{1F9BB}', '\u{1F9BC}', '\u{1F9BD}', '\u{1F9BE}', + '\u{1F9BF}', '\u{1F9C0}', '\u{1F9C1}', '\u{1F9C2}', '\u{1F9C3}', '\u{1F9C4}', '\u{1F9C5}', '\u{1F9C6}', + '\u{1F9C7}', '\u{1F9C8}', '\u{1F9C9}', '\u{1F9CA}', '\u{1F9CB}', '\u{1F9CC}', '\u{1F9CD}', '\u{1F9CE}', + '\u{1F9CF}', '\u{1F9D0}', '\u{1F9D1}', '\u{1F9D2}', '\u{1F9D3}', '\u{1F9D4}', '\u{1F9D5}', '\u{1F9D6}', + '\u{1F9D7}', '\u{1F9D8}', '\u{1F9D9}', '\u{1F9DA}', '\u{1F9DB}', '\u{1F9DC}', '\u{1F9DD}', '\u{1F9DE}', + '\u{1F9DF}', '\u{1F9E0}', '\u{1F9E1}', '\u{1F9E2}', '\u{1F9E3}', '\u{1F9E4}', '\u{1F9E5}', '\u{1F9E6}', + '\u{1F9E7}', '\u{1F9E8}', '\u{1F9E9}', '\u{1F9EA}', '\u{1F9EB}', '\u{1F9EC}', '\u{1F9ED}', '\u{1F9EE}', + '\u{1F9EF}', '\u{1F9F0}', '\u{1F9F1}', '\u{1F9F2}', '\u{1F9F3}', '\u{1F9F4}', '\u{1F9F5}', '\u{1F9F6}', + '\u{1F9F7}', '\u{1F9F8}', '\u{1F9F9}', '\u{1F9FA}', '\u{1F9FB}', '\u{1F9FC}', '\u{1F9FD}', '\u{1F9FE}', + '\u{1F9FF}', '\u{1FA00}', '\u{1FA01}', '\u{1FA02}', '\u{1FA03}', '\u{1FA04}', '\u{1FA05}', '\u{1FA06}', + '\u{1FA07}', '\u{1FA08}', '\u{1FA09}', '\u{1FA0A}', '\u{1FA0B}', '\u{1FA0C}', '\u{1FA0D}', '\u{1FA0E}', + '\u{1FA0F}', '\u{1FA10}', '\u{1FA11}', '\u{1FA12}', '\u{1FA13}', '\u{1FA14}', '\u{1FA15}', '\u{1FA16}', + '\u{1FA17}', '\u{1FA18}', '\u{1FA19}', '\u{1FA1A}', '\u{1FA1B}', '\u{1FA1C}', '\u{1FA1D}', '\u{1FA1E}', + '\u{1FA1F}', '\u{1FA20}', '\u{1FA21}', '\u{1FA22}', '\u{1FA23}', '\u{1FA24}', '\u{1FA25}', '\u{1FA26}', + '\u{1FA27}', '\u{1FA28}', '\u{1FA29}', '\u{1FA2A}', '\u{1FA2B}', '\u{1FA2C}', '\u{1FA2D}', '\u{1FA2E}', + '\u{1FA2F}', '\u{1FA30}', '\u{1FA31}', '\u{1FA32}', '\u{1FA33}', '\u{1FA34}', '\u{1FA35}', '\u{1FA36}', + '\u{1FA37}', '\u{1FA38}', '\u{1FA39}', '\u{1FA3A}', '\u{1FA3B}', '\u{1FA3C}', '\u{1FA3D}', '\u{1FA3E}', + '\u{1FA3F}', '\u{1FA40}', '\u{1FA41}', '\u{1FA42}', '\u{1FA43}', '\u{1FA44}', '\u{1FA45}', '\u{1FA46}', + '\u{1FA47}', '\u{1FA48}', '\u{1FA49}', '\u{1FA4A}', '\u{1FA4B}', '\u{1FA4C}', '\u{1FA4D}', '\u{1FA4E}', + '\u{1FA4F}', '\u{1FA50}', '\u{1FA51}', '\u{1FA52}', '\u{1FA53}', '\u{1FA60}', '\u{1FA61}', '\u{1FA62}', + '\u{1FA63}', '\u{1FA64}', '\u{1FA65}', '\u{1FA66}', '\u{1FA67}', '\u{1FA68}', '\u{1FA69}', '\u{1FA6A}', + '\u{1FA6B}', '\u{1FA6C}', '\u{1FA6D}', '\u{1FA70}', '\u{1FA71}', '\u{1FA72}', '\u{1FA73}', '\u{1FA74}', + '\u{1FA78}', '\u{1FA79}', '\u{1FA7A}', '\u{1FA7B}', '\u{1FA7C}', '\u{1FA80}', '\u{1FA81}', '\u{1FA82}', + '\u{1FA83}', '\u{1FA84}', '\u{1FA85}', '\u{1FA86}', '\u{1FA90}', '\u{1FA91}', '\u{1FA92}', '\u{1FA93}', + '\u{1FA94}', '\u{1FA95}', '\u{1FA96}', '\u{1FA97}', '\u{1FA98}', '\u{1FA99}', '\u{1FA9A}', '\u{1FA9B}', + '\u{1FA9C}', '\u{1FA9D}', '\u{1FA9E}', '\u{1FA9F}', '\u{1FAA0}', '\u{1FAA1}', '\u{1FAA2}', '\u{1FAA3}', + '\u{1FAA4}', '\u{1FAA5}', '\u{1FAA6}', '\u{1FAA7}', '\u{1FAA8}', '\u{1FAA9}', '\u{1FAAA}', '\u{1FAAB}', + '\u{1FAAC}', '\u{1FAB0}', '\u{1FAB1}', '\u{1FAB2}', '\u{1FAB3}', '\u{1FAB4}', '\u{1FAB5}', '\u{1FAB6}', + '\u{1FAB7}', '\u{1FAB8}', '\u{1FAB9}', '\u{1FABA}', '\u{1FAC0}', '\u{1FAC1}', '\u{1FAC2}', '\u{1FAC3}', + '\u{1FAC4}', '\u{1FAC5}', '\u{1FAD0}', '\u{1FAD1}', '\u{1FAD2}', '\u{1FAD3}', '\u{1FAD4}', '\u{1FAD5}', + '\u{1FAD6}', '\u{1FAD7}', '\u{1FAD8}', '\u{1FAD9}', '\u{1FAE0}', '\u{1FAE1}', '\u{1FAE2}', '\u{1FAE3}', + '\u{1FAE4}', '\u{1FAE5}', '\u{1FAE6}', '\u{1FAE7}', '\u{1FAF0}', '\u{1FAF1}', '\u{1FAF2}', '\u{1FAF3}', + '\u{1FAF4}', '\u{1FAF5}', '\u{1FAF6}', '\u{1FB00}', '\u{1FB01}', '\u{1FB02}', '\u{1FB03}', '\u{1FB04}', + '\u{1FB05}', '\u{1FB06}', '\u{1FB07}', '\u{1FB08}', '\u{1FB09}', '\u{1FB0A}', '\u{1FB0B}', '\u{1FB0C}', + '\u{1FB0D}', '\u{1FB0E}', '\u{1FB0F}', '\u{1FB10}', '\u{1FB11}', '\u{1FB12}', '\u{1FB13}', '\u{1FB14}', + '\u{1FB15}', '\u{1FB16}', '\u{1FB17}', '\u{1FB18}', '\u{1FB19}', '\u{1FB1A}', '\u{1FB1B}', '\u{1FB1C}', + '\u{1FB1D}', '\u{1FB1E}', '\u{1FB1F}', '\u{1FB20}', '\u{1FB21}', '\u{1FB22}', '\u{1FB23}', '\u{1FB24}', + '\u{1FB25}', '\u{1FB26}', '\u{1FB27}', '\u{1FB28}', '\u{1FB29}', '\u{1FB2A}', '\u{1FB2B}', '\u{1FB2C}', + '\u{1FB2D}', '\u{1FB2E}', '\u{1FB2F}', '\u{1FB30}', '\u{1FB31}', '\u{1FB32}', '\u{1FB33}', '\u{1FB34}', + '\u{1FB35}', '\u{1FB36}', '\u{1FB37}', '\u{1FB38}', '\u{1FB39}', '\u{1FB3A}', '\u{1FB3B}', '\u{1FB3C}', + '\u{1FB3D}', '\u{1FB3E}', '\u{1FB3F}', '\u{1FB40}', '\u{1FB41}', '\u{1FB42}', '\u{1FB43}', '\u{1FB44}', + '\u{1FB45}', '\u{1FB46}', '\u{1FB47}', '\u{1FB48}', '\u{1FB49}', '\u{1FB4A}', '\u{1FB4B}', '\u{1FB4C}', + '\u{1FB4D}', '\u{1FB4E}', '\u{1FB4F}', '\u{1FB50}', '\u{1FB51}', '\u{1FB52}', '\u{1FB53}', '\u{1FB54}', + '\u{1FB55}', '\u{1FB56}', '\u{1FB57}', '\u{1FB58}', '\u{1FB59}', '\u{1FB5A}', '\u{1FB5B}', '\u{1FB5C}', + '\u{1FB5D}', '\u{1FB5E}', '\u{1FB5F}', '\u{1FB60}', '\u{1FB61}', '\u{1FB62}', '\u{1FB63}', '\u{1FB64}', + '\u{1FB65}', '\u{1FB66}', '\u{1FB67}', '\u{1FB68}', '\u{1FB69}', '\u{1FB6A}', '\u{1FB6B}', '\u{1FB6C}', + '\u{1FB6D}', '\u{1FB6E}', '\u{1FB6F}', '\u{1FB70}', '\u{1FB71}', '\u{1FB72}', '\u{1FB73}', '\u{1FB74}', + '\u{1FB75}', '\u{1FB76}', '\u{1FB77}', '\u{1FB78}', '\u{1FB79}', '\u{1FB7A}', '\u{1FB7B}', '\u{1FB7C}', + '\u{1FB7D}', '\u{1FB7E}', '\u{1FB7F}', '\u{1FB80}', '\u{1FB81}', '\u{1FB82}', '\u{1FB83}', '\u{1FB84}', + '\u{1FB85}', '\u{1FB86}', '\u{1FB87}', '\u{1FB88}', '\u{1FB89}', '\u{1FB8A}', '\u{1FB8B}', '\u{1FB8C}', + '\u{1FB8D}', '\u{1FB8E}', '\u{1FB8F}', '\u{1FB90}', '\u{1FB91}', '\u{1FB92}', '\u{1FB94}', '\u{1FB95}', + '\u{1FB96}', '\u{1FB97}', '\u{1FB98}', '\u{1FB99}', '\u{1FB9A}', '\u{1FB9B}', '\u{1FB9C}', '\u{1FB9D}', + '\u{1FB9E}', '\u{1FB9F}', '\u{1FBA0}', '\u{1FBA1}', '\u{1FBA2}', '\u{1FBA3}', '\u{1FBA4}', '\u{1FBA5}', + '\u{1FBA6}', '\u{1FBA7}', '\u{1FBA8}', '\u{1FBA9}', '\u{1FBAA}', '\u{1FBAB}', '\u{1FBAC}', '\u{1FBAD}', + '\u{1FBAE}', '\u{1FBAF}', '\u{1FBB0}', '\u{1FBB1}', '\u{1FBB2}', '\u{1FBB3}', '\u{1FBB4}', '\u{1FBB5}', + '\u{1FBB6}', '\u{1FBB7}', '\u{1FBB8}', '\u{1FBB9}', '\u{1FBBA}', '\u{1FBBB}', '\u{1FBBC}', '\u{1FBBD}', + '\u{1FBBE}', '\u{1FBBF}', '\u{1FBC0}', '\u{1FBC1}', '\u{1FBC2}', '\u{1FBC3}', '\u{1FBC4}', '\u{1FBC5}', + '\u{1FBC6}', '\u{1FBC7}', '\u{1FBC8}', '\u{1FBC9}', '\u{1FBCA}' + +}; + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + + use utils::suffixlist::PublicSuffix; + + use crate::bayes::tokenize::BayesTokenizer; + + #[test] + fn bayes_tokenizer() { + let inputs = [ + ( + "The quick brown fox jumps over the lazy dog", + vec!["quick", "brown", "fox", "jump", "lazi", "dog"], + ), + ( + "Jovencillo emponzoñado de whisky: ¡qué figurota exhibe!", + vec!["jovencill", "emponzoñ", "whisky", "figurot", "exhib"], + ), + ( + "Ma la volpe col suo balzo ha raggiunto il quieto Fido", + vec!["volp", "balz", "raggiunt", "quiet", "fid"], + ), + ( + "Jaz em prisão bota que vexa dez cegonhas felizes", + vec!["jaz", "prisã", "bot", "vex", "dez", "cegonh", "feliz"], + ), + ( + "Zwölf Boxkämpfer jagten Victor quer über den großen Sylter Deich", + vec![ + "zwolf", "boxkampf", "jagt", "victor", "quer", "gross", "sylt", "deich", + ], + ), + ( + "עטלף אבק נס דרך מזגן שהתפוצץ כי חם", + vec!["עטלף", "אבק", "נס", "דרך", "מזגן", "שהתפוצץ", "כי", "חם"], + ), + ( + "Съешь ещё этих мягких французских булок, да выпей же чаю", + vec![ + "съеш", + "ещё", + "эт", + "мягк", + "французск", + "булок", + "вып", + "ча", + ], + ), + ( + "Чуєш їх, доцю, га? Кумедна ж ти, прощайся без ґольфів!", + vec![ + "чуєш", + "їх", + "доцю", + "га", + "кумедна", + "ж", + "ти", + "прощайся", + "без", + "ґольфів", + ], + ), + ( + "Љубазни фењерџија чађавог лица хоће да ми покаже штос", + vec![ + "љубазни", + "фењерџија", + "чађавог", + "лица", + "хоће", + "да", + "ми", + "покаже", + "штос", + ], + ), + ( + "Pijamalı hasta yağız şoföre çabucak güvendi", + vec!["pijamalı", "hasta", "yağız", "şoför", "çabucak", "güvendi"], + ), + ("己所不欲,勿施于人。", vec!["己所不欲", "勿施于人"]), + ( + "井の中の蛙大海を知らず", + vec!["井", "の", "中", "の", "蛙大", "海", "を", "知ら", "ず"], + ), + ("시작이 반이다", vec!["시작이", "반이다"]), + ]; + + let suffixes = PublicSuffix::default(); + + for (input, expect) in inputs.iter() { + let input = BayesTokenizer::new(input, &suffixes).collect::>(); + let expect = expect.iter().copied().map(Cow::from).collect::>(); + + assert_eq!(input, expect,); + } + } +} diff --git a/crates/nlp/src/bayes/train.rs b/crates/nlp/src/bayes/train.rs index 7ba0881d..eabf9118 100644 --- a/crates/nlp/src/bayes/train.rs +++ b/crates/nlp/src/bayes/train.rs @@ -21,7 +21,7 @@ * for more details. */ -use crate::transformers::osb::OsbToken; +use crate::tokenizers::osb::OsbToken; use super::{BayesModel, TokenHash}; diff --git a/crates/nlp/src/language/mod.rs b/crates/nlp/src/language/mod.rs index edc87368..532c3d34 100644 --- a/crates/nlp/src/language/mod.rs +++ b/crates/nlp/src/language/mod.rs @@ -21,6 +21,10 @@ * for more details. */ +pub mod detect; +pub mod stemmer; +pub mod stopwords; + use std::borrow::Cow; use crate::tokenizers::{ @@ -29,9 +33,6 @@ use crate::tokenizers::{ use self::detect::LanguageDetector; -pub mod detect; -pub mod stemmer; - pub type LanguageTokenizer<'x> = Box>> + 'x>; impl Language { @@ -131,57 +132,9 @@ pub enum Language { impl Language { pub fn from_iso_639(code: &str) -> Option { - match code.split_once('-').map(|c| c.0).unwrap_or(code) { - "en" => Language::English, - "es" => Language::Spanish, - "pt" => Language::Portuguese, - "it" => Language::Italian, - "fr" => Language::French, - "de" => Language::German, - "ru" => Language::Russian, - "zh" => Language::Mandarin, - "ja" => Language::Japanese, - "ar" => Language::Arabic, - "hi" => Language::Hindi, - "ko" => Language::Korean, - "bn" => Language::Bengali, - "he" => Language::Hebrew, - "ur" => Language::Urdu, - "fa" => Language::Persian, - "ml" => Language::Malayalam, - "or" => Language::Oriya, - "my" => Language::Burmese, - "ne" => Language::Nepali, - "si" => Language::Sinhalese, - "km" => Language::Khmer, - "tk" => Language::Turkmen, - "am" => Language::Amharic, - "az" => Language::Azerbaijani, - "id" => Language::Indonesian, - "te" => Language::Telugu, - "ta" => Language::Tamil, - "vi" => Language::Vietnamese, - "gu" => Language::Gujarati, - "pa" => Language::Punjabi, - "uz" => Language::Uzbek, - "hy" => Language::Armenian, - "ka" => Language::Georgian, - "la" => Language::Latin, - "sl" => Language::Slovene, - "hr" => Language::Croatian, - "sr" => Language::Serbian, - "mk" => Language::Macedonian, - "lt" => Language::Lithuanian, - "lv" => Language::Latvian, - "et" => Language::Estonian, - "tl" => Language::Tagalog, - "af" => Language::Afrikaans, - "zu" => Language::Zulu, - "sn" => Language::Shona, - "ak" => Language::Akan, - _ => return None, - } - .into() + LANG_ISO + .get(code.split_once('-').map(|c| c.0).unwrap_or(code)) + .copied() } } @@ -200,3 +153,53 @@ impl Language { } } } + +static LANG_ISO: phf::Map<&'static str, Language> = phf::phf_map! { + "en" => Language::English, + "es" => Language::Spanish, + "pt" => Language::Portuguese, + "it" => Language::Italian, + "fr" => Language::French, + "de" => Language::German, + "ru" => Language::Russian, + "zh" => Language::Mandarin, + "ja" => Language::Japanese, + "ar" => Language::Arabic, + "hi" => Language::Hindi, + "ko" => Language::Korean, + "bn" => Language::Bengali, + "he" => Language::Hebrew, + "ur" => Language::Urdu, + "fa" => Language::Persian, + "ml" => Language::Malayalam, + "or" => Language::Oriya, + "my" => Language::Burmese, + "ne" => Language::Nepali, + "si" => Language::Sinhalese, + "km" => Language::Khmer, + "tk" => Language::Turkmen, + "am" => Language::Amharic, + "az" => Language::Azerbaijani, + "id" => Language::Indonesian, + "te" => Language::Telugu, + "ta" => Language::Tamil, + "vi" => Language::Vietnamese, + "gu" => Language::Gujarati, + "pa" => Language::Punjabi, + "uz" => Language::Uzbek, + "hy" => Language::Armenian, + "ka" => Language::Georgian, + "la" => Language::Latin, + "sl" => Language::Slovene, + "hr" => Language::Croatian, + "sr" => Language::Serbian, + "mk" => Language::Macedonian, + "lt" => Language::Lithuanian, + "lv" => Language::Latvian, + "et" => Language::Estonian, + "tl" => Language::Tagalog, + "af" => Language::Afrikaans, + "zu" => Language::Zulu, + "sn" => Language::Shona, + "ak" => Language::Akan, +}; diff --git a/crates/nlp/src/language/stemmer.rs b/crates/nlp/src/language/stemmer.rs index cd3da5e2..9dfc30b9 100644 --- a/crates/nlp/src/language/stemmer.rs +++ b/crates/nlp/src/language/stemmer.rs @@ -70,7 +70,7 @@ impl<'x> Iterator for Stemmer<'x> { } } -static STEMMER_MAP: &[Option] = &[ +pub static STEMMER_MAP: &[Option] = &[ None, // Esperanto = 0, Some(Algorithm::English), // English = 1, Some(Algorithm::Russian), // Russian = 2, diff --git a/crates/nlp/src/language/stopwords.rs b/crates/nlp/src/language/stopwords.rs new file mode 100644 index 00000000..3c661ae3 --- /dev/null +++ b/crates/nlp/src/language/stopwords.rs @@ -0,0 +1,4192 @@ +/* + * Copyright (c) 2023, Stalwart Labs Ltd. + * + * This file is part of Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use phf::{phf_set, Set}; +pub static STOP_WORDS: &[Option<&Set<&'static str>>] = &[ + None, // Esperanto = 0, + Some(&ENGLISH), // English = 1, + Some(&RUSSIAN), // Russian = 2, + None, // Mandarin = 3, + Some(&SPANISH), // Spanish = 4, + Some(&PORTUGUESE), // Portuguese = 5, + Some(&ITALIAN), // Italian = 6, + None, // Bengali = 7, + Some(&FRENCH), // French = 8, + Some(&GERMAN), // German = 9, + None, // Ukrainian = 10, + None, // Georgian = 11, + Some(&ARABIC), // Arabic = 12, + None, // Hindi = 13, + None, // Japanese = 14, + None, // Hebrew = 15, + None, // Yiddish = 16, + None, // Polish = 17, + None, // Amharic = 18, + None, // Javanese = 19, + None, // Korean = 20, + Some(&NORWEGIAN), // Bokmal = 21, + Some(&DANISH), // Danish = 22, + Some(&SWEDISH), // Swedish = 23, + Some(&FINNISH), // Finnish = 24, + Some(&TURKISH), // Turkish = 25, + Some(&DUTCH), // Dutch = 26, + Some(&HUNGARIAN), // Hungarian = 27, + None, // Czech = 28, + Some(&GREEK), // Greek = 29, + None, // Bulgarian = 30, + None, // Belarusian = 31, + None, // Marathi = 32, + None, // Kannada = 33, + Some(&ROMANIAN), // Romanian = 34, + None, // Slovene = 35, + None, // Croatian = 36, + None, // Serbian = 37, + None, // Macedonian = 38, + None, // Lithuanian = 39, + None, // Latvian = 40, + None, // Estonian = 41, + None, // Tamil = 42, + None, // Vietnamese = 43, + None, // Urdu = 44, + None, // Thai = 45, + None, // Gujarati = 46, + None, // Uzbek = 47, + None, // Punjabi = 48, + Some(&AZERBAIJANI), // Azerbaijani = 49, + None, // Indonesian = 50, + None, // Telugu = 51, + None, // Persian = 52, + None, // Malayalam = 53, + None, // Oriya = 54, + None, // Burmese = 55, + Some(&NEPALI), // Nepali = 56, + None, // Sinhalese = 57, + None, // Khmer = 58, + None, // Turkmen = 59, + None, // Akan = 60, + None, // Zulu = 61, + None, // Shona = 62, + None, // Afrikaans = 63, + None, // Latin = 64, + None, // Slovak = 65, + None, // Catalan = 66, + None, // Tagalog = 67, + None, // Armenian = 68, + None, // Unknown = 69, +]; + +static ARABIC: Set<&'static str> = phf_set! { + "آه", + "آي", + "أف", + "أم", + "أن", + "أو", + "أي", + "إذ", + "إن", + "إي", + "بخ", + "بس", + "بك", + "بل", + "به", + "بي", + "ته", + "تي", + "ثم", + "ذا", + "ذه", + "ذو", + "ذي", + "عل", + "عن", + "في", + "قد", + "كل", + "كم", + "كي", + "لا", + "لك", + "لم", + "لن", + "له", + "لو", + "لي", + "ما", + "مذ", + "مع", + "من", + "مه", + "ها", + "هل", + "هم", + "هن", + "هو", + "هي", + "يا", + "آها", + "أقل", + "ألا", + "أما", + "أنا", + "أنت", + "أنى", + "أوه", + "أين", + "إذا", + "إذن", + "إلا", + "إلى", + "إما", + "إنا", + "إنه", + "إيه", + "بعد", + "بعض", + "بكم", + "بكن", + "بلى", + "بما", + "بمن", + "بنا", + "بها", + "بهم", + "بهن", + "بيد", + "بين", + "تلك", + "تين", + "ثمة", + "حتى", + "حيث", + "حين", + "خلا", + "دون", + "ذات", + "ذاك", + "ذان", + "ذلك", + "ذوا", + "ذين", + "ريث", + "سوف", + "سوى", + "عدا", + "عسى", + "على", + "عما", + "عند", + "غير", + "فإن", + "فلا", + "فمن", + "فيم", + "فيه", + "كأن", + "كأي", + "كذا", + "كلا", + "كما", + "كيت", + "كيف", + "لئن", + "لدى", + "لست", + "لسن", + "لعل", + "لكم", + "لكن", + "لكي", + "لما", + "لنا", + "لها", + "لهم", + "لهن", + "ليت", + "ليس", + "متى", + "مما", + "ممن", + "منذ", + "منه", + "نحن", + "نحو", + "نعم", + "هاك", + "هذا", + "هذه", + "هذي", + "هلا", + "هما", + "هنا", + "هيا", + "هيت", + "وإذ", + "وإن", + "ولا", + "ولو", + "وما", + "ومن", + "وهو", + "أكثر", + "أنتم", + "أنتن", + "أيها", + "إذما", + "إليك", + "إنما", + "التي", + "الذي", + "بكما", + "بهما", + "تلكم", + "تينك", + "حاشا", + "حبذا", + "ذانك", + "ذلكم", + "ذلكن", + "ذينك", + "شتان", + "عليك", + "عليه", + "فإذا", + "فيما", + "فيها", + "كأين", + "كذلك", + "كلتا", + "كلما", + "لستم", + "لستن", + "لسنا", + "لكما", + "لهما", + "لولا", + "لوما", + "ليسا", + "ليست", + "ماذا", + "منها", + "مهما", + "هاته", + "هاتي", + "هذان", + "هذين", + "هكذا", + "هناك", + "وإذا", + "ولكن", + "أنتما", + "أولئك", + "أولاء", + "أينما", + "إليكم", + "إليكن", + "الذين", + "بماذا", + "تلكما", + "حيثما", + "ذلكما", + "ذواتا", + "ذواتي", + "كأنما", + "كيفما", + "لستما", + "لكنما", + "لكيلا", + "ليستا", + "ليسوا", + "هؤلاء", + "هاتان", + "هاتين", + "هاهنا", + "هنالك", + "هيهات", + "والذي", + "إليكما", + "اللائي", + "اللاتي", + "اللتان", + "اللتيا", + "اللتين", + "اللذان", + "اللذين", + "كلاهما", + "كليكما", + "كليهما", + "لاسيما", + "والذين", + "اللواتي", +}; + +static AZERBAIJANI: Set<&'static str> = phf_set! { + "a", + "ad", + "altmış", + "altı", + "amma", + "arasında", + "artıq", + "ay", + "az", + "bax", + "belə", + "beş", + "bilər", + "bir", + "biraz", + "biri", + "birşey", + "biz", + "bizim", + "bizlər", + "bu", + "buna", + "bundan", + "bunların", + "bunu", + "bunun", + "buradan", + "bütün", + "bəli", + "bəlkə", + "bəy", + "bəzi", + "bəzən", + "ci", + "çox", + "cu", + "cü", + "çünki", + "cı", + "da", + "daha", + "dedi", + "deyil", + "dir", + "doqquz", + "doqsan", + "dörd", + "düz", + "də", + "dək", + "dən", + "dəqiqə", + "edir", + "edən", + "elə", + "et", + "etdi", + "etmə", + "etmək", + "faiz", + "gilə", + "görə", + "ha", + "haqqında", + "harada", + "heç", + "hə", + "həm", + "həmin", + "həmişə", + "hər", + "idi", + "iki", + "il", + "ildə", + "ilk", + "ilə", + "in", + "indi", + "istifadə", + "isə", + "iyirmi", + "ki", + "kim", + "kimi", + "kimə", + "lakin", + "lap", + "mirşey", + "məhz", + "mən", + "mənə", + "niyə", + "nə", + "nəhayət", + "o", + "obirisi", + "of", + "olan", + "olar", + "olaraq", + "oldu", + "olduğu", + "olmadı", + "olmaz", + "olmuşdur", + "olsun", + "olur", + "on", + "ona", + "ondan", + "onlar", + "onlardan", + "onların", + "onsuzda", + "onu", + "onun", + "oradan", + "otuz", + "öz", + "özü", + "qarşı", + "qədər", + "qırx", + "saat", + "sadəcə", + "saniyə", + "siz", + "sizin", + "sizlər", + "sonra", + "səhv", + "səkkiz", + "səksən", + "sən", + "sənin", + "sənə", + "təəssüf", + "ü", + "üç", + "üçün", + "var", + "və", + "xan", + "xanım", + "xeyr", + "ya", + "yalnız", + "yaxşı", + "yeddi", + "yenə", + "yetmiş", + "yox", + "yoxdur", + "yoxsa", + "yüz", + "yəni", + "zaman", + "ı", + "ə", + "əgər", + "əlbəttə", + "əlli", + "ən", + "əslində", +}; + +static DANISH: Set<&'static str> = phf_set! { + "ad", + "af", + "alle", + "alt", + "anden", + "at", + "blev", + "blive", + "bliver", + "da", + "de", + "dem", + "den", + "denne", + "der", + "deres", + "det", + "dette", + "dig", + "din", + "disse", + "dog", + "du", + "efter", + "eller", + "en", + "end", + "er", + "et", + "for", + "fra", + "ham", + "han", + "hans", + "har", + "havde", + "have", + "hende", + "hendes", + "her", + "hos", + "hun", + "hvad", + "hvis", + "hvor", + "i", + "ikke", + "ind", + "jeg", + "jer", + "jo", + "kunne", + "man", + "mange", + "med", + "meget", + "men", + "mig", + "min", + "mine", + "mit", + "mod", + "når", + "ned", + "noget", + "nogle", + "nu", + "og", + "også", + "om", + "op", + "os", + "over", + "på", + "sådan", + "selv", + "sig", + "sin", + "sine", + "sit", + "skal", + "skulle", + "som", + "thi", + "til", + "ud", + "under", + "var", + "være", + "været", + "vi", + "vil", + "ville", + "vor", +}; + +static DUTCH: Set<&'static str> = phf_set! { + "aan", + "al", + "alles", + "als", + "altijd", + "andere", + "ben", + "bij", + "daar", + "dan", + "dat", + "de", + "der", + "deze", + "die", + "dit", + "doch", + "doen", + "door", + "dus", + "een", + "eens", + "en", + "er", + "ge", + "geen", + "geweest", + "haar", + "had", + "heb", + "hebben", + "heeft", + "hem", + "het", + "hier", + "hij", + "hoe", + "hun", + "iemand", + "iets", + "ik", + "in", + "is", + "ja", + "je", + "kan", + "kon", + "kunnen", + "maar", + "me", + "meer", + "men", + "met", + "mij", + "mijn", + "moet", + "na", + "naar", + "niet", + "niets", + "nog", + "nu", + "of", + "om", + "omdat", + "onder", + "ons", + "ook", + "op", + "over", + "reeds", + "te", + "tegen", + "toch", + "toen", + "tot", + "u", + "uit", + "uw", + "van", + "veel", + "voor", + "want", + "waren", + "was", + "wat", + "werd", + "wezen", + "wie", + "wil", + "worden", + "wordt", + "zal", + "ze", + "zelf", + "zich", + "zij", + "zijn", + "zo", + "zonder", + "zou", +}; + +static ENGLISH: Set<&'static str> = phf_set! { + "a", + "about", + "above", + "after", + "again", + "against", + "ain", + "all", + "am", + "an", + "and", + "any", + "are", + "aren", + "aren't", + "as", + "at", + "be", + "because", + "been", + "before", + "being", + "below", + "between", + "both", + "but", + "by", + "can", + "couldn", + "couldn't", + "d", + "did", + "didn", + "didn't", + "do", + "does", + "doesn", + "doesn't", + "doing", + "don", + "don't", + "down", + "during", + "each", + "few", + "for", + "from", + "further", + "had", + "hadn", + "hadn't", + "has", + "hasn", + "hasn't", + "have", + "haven", + "haven't", + "having", + "he", + "her", + "here", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "i", + "if", + "in", + "into", + "is", + "isn", + "isn't", + "it", + "it's", + "its", + "itself", + "just", + "ll", + "m", + "ma", + "me", + "mightn", + "mightn't", + "more", + "most", + "mustn", + "mustn't", + "my", + "myself", + "needn", + "needn't", + "no", + "nor", + "not", + "now", + "o", + "of", + "off", + "on", + "once", + "only", + "or", + "other", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "re", + "s", + "same", + "shan", + "shan't", + "she", + "she's", + "should", + "should've", + "shouldn", + "shouldn't", + "so", + "some", + "such", + "t", + "than", + "that", + "that'll", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "there", + "these", + "they", + "this", + "those", + "through", + "to", + "too", + "under", + "until", + "up", + "ve", + "very", + "was", + "wasn", + "wasn't", + "we", + "were", + "weren", + "weren't", + "what", + "when", + "where", + "which", + "while", + "who", + "whom", + "why", + "will", + "with", + "won", + "won't", + "wouldn", + "wouldn't", + "y", + "you", + "you'd", + "you'll", + "you're", + "you've", + "your", + "yours", + "yourself", + "yourselves", +}; + +static FINNISH: Set<&'static str> = phf_set! { + "ei", + "eivät", + "emme", + "en", + "et", + "että", + "ette", + "hän", + "häneen", + "hänellä", + "hänelle", + "häneltä", + "hänen", + "hänessä", + "hänestä", + "hänet", + "häntä", + "he", + "heidän", + "heidät", + "heihin", + "heillä", + "heille", + "heiltä", + "heissä", + "heistä", + "heitä", + "itse", + "ja", + "johon", + "joiden", + "joihin", + "joiksi", + "joilla", + "joille", + "joilta", + "joina", + "joissa", + "joista", + "joita", + "joka", + "joksi", + "jolla", + "jolle", + "jolta", + "jona", + "jonka", + "jos", + "jossa", + "josta", + "jota", + "jotka", + "kanssa", + "keiden", + "keihin", + "keiksi", + "keillä", + "keille", + "keiltä", + "keinä", + "keissä", + "keistä", + "keitä", + "keneen", + "keneksi", + "kenellä", + "kenelle", + "keneltä", + "kenen", + "kenenä", + "kenessä", + "kenestä", + "kenet", + "ketä", + "ketkä", + "koska", + "kuin", + "kuka", + "kun", + "me", + "meidän", + "meidät", + "meihin", + "meillä", + "meille", + "meiltä", + "meissä", + "meistä", + "meitä", + "mihin", + "mikä", + "miksi", + "millä", + "mille", + "miltä", + "minä", + "minkä", + "minua", + "minulla", + "minulle", + "minulta", + "minun", + "minussa", + "minusta", + "minut", + "minuun", + "missä", + "mistä", + "mitä", + "mitkä", + "mukaan", + "mutta", + "näiden", + "näihin", + "näiksi", + "näillä", + "näille", + "näiltä", + "näinä", + "näissä", + "näistä", + "näitä", + "nämä", + "ne", + "niiden", + "niihin", + "niiksi", + "niillä", + "niille", + "niiltä", + "niin", + "niinä", + "niissä", + "niistä", + "niitä", + "noiden", + "noihin", + "noiksi", + "noilla", + "noille", + "noilta", + "noin", + "noina", + "noissa", + "noista", + "noita", + "nuo", + "nyt", + "ole", + "olemme", + "olen", + "olet", + "olette", + "oli", + "olimme", + "olin", + "olisi", + "olisimme", + "olisin", + "olisit", + "olisitte", + "olisivat", + "olit", + "olitte", + "olivat", + "olla", + "olleet", + "ollut", + "on", + "ovat", + "poikki", + "se", + "sekä", + "sen", + "siihen", + "siinä", + "siitä", + "siksi", + "sillä", + "sille", + "siltä", + "sinä", + "sinua", + "sinulla", + "sinulle", + "sinulta", + "sinun", + "sinussa", + "sinusta", + "sinut", + "sinuun", + "sitä", + "tähän", + "tai", + "täksi", + "tallä", + "tälle", + "tältä", + "tämä", + "tämän", + "tänä", + "tässä", + "tästä", + "tätä", + "te", + "teidän", + "teidät", + "teihin", + "teillä", + "teille", + "teiltä", + "teissä", + "teistä", + "teitä", + "tuo", + "tuohon", + "tuoksi", + "tuolla", + "tuolle", + "tuolta", + "tuon", + "tuona", + "tuossa", + "tuosta", + "tuotä", + "vaan", + "vai", + "vaikka", + "yli", +}; + +static FRENCH: Set<&'static str> = phf_set! { + "à", + "ai", + "aie", + "aient", + "aies", + "ait", + "as", + "au", + "aura", + "aurai", + "auraient", + "aurais", + "aurait", + "auras", + "aurez", + "auriez", + "aurions", + "aurons", + "auront", + "aux", + "avaient", + "avais", + "avait", + "avec", + "avez", + "aviez", + "avions", + "avons", + "ayant", + "ayante", + "ayantes", + "ayants", + "ayez", + "ayons", + "c", + "ce", + "ces", + "d", + "dans", + "de", + "des", + "du", + "elle", + "en", + "es", + "est", + "et", + "étaient", + "étais", + "était", + "étant", + "étante", + "étantes", + "étants", + "été", + "étée", + "étées", + "étés", + "êtes", + "étiez", + "étions", + "eu", + "eue", + "eues", + "eûmes", + "eurent", + "eus", + "eusse", + "eussent", + "eusses", + "eussiez", + "eussions", + "eut", + "eût", + "eûtes", + "eux", + "fûmes", + "furent", + "fus", + "fusse", + "fussent", + "fusses", + "fussiez", + "fussions", + "fut", + "fût", + "fûtes", + "il", + "j", + "je", + "l", + "la", + "le", + "leur", + "lui", + "m", + "ma", + "mais", + "me", + "même", + "mes", + "moi", + "mon", + "n", + "ne", + "nos", + "notre", + "nous", + "on", + "ont", + "ou", + "par", + "pas", + "pour", + "qu", + "que", + "qui", + "s", + "sa", + "se", + "sera", + "serai", + "seraient", + "serais", + "serait", + "seras", + "serez", + "seriez", + "serions", + "serons", + "seront", + "ses", + "soient", + "sois", + "soit", + "sommes", + "son", + "sont", + "soyez", + "soyons", + "suis", + "sur", + "t", + "ta", + "te", + "tes", + "toi", + "ton", + "tu", + "un", + "une", + "vos", + "votre", + "vous", + "y", +}; + +static GERMAN: Set<&'static str> = phf_set! { + "aber", + "alle", + "allem", + "allen", + "aller", + "alles", + "als", + "also", + "am", + "an", + "ander", + "andere", + "anderem", + "anderen", + "anderer", + "anderes", + "anderm", + "andern", + "anderr", + "anders", + "auch", + "auf", + "aus", + "bei", + "bin", + "bis", + "bist", + "da", + "damit", + "dann", + "das", + "dasselbe", + "dazu", + "daß", + "dein", + "deine", + "deinem", + "deinen", + "deiner", + "deines", + "dem", + "demselben", + "den", + "denn", + "denselben", + "der", + "derer", + "derselbe", + "derselben", + "des", + "desselben", + "dessen", + "dich", + "die", + "dies", + "diese", + "dieselbe", + "dieselben", + "diesem", + "diesen", + "dieser", + "dieses", + "dir", + "doch", + "dort", + "du", + "durch", + "ein", + "eine", + "einem", + "einen", + "einer", + "eines", + "einig", + "einige", + "einigem", + "einigen", + "einiger", + "einiges", + "einmal", + "er", + "es", + "etwas", + "euch", + "euer", + "eure", + "eurem", + "euren", + "eurer", + "eures", + "für", + "gegen", + "gewesen", + "hab", + "habe", + "haben", + "hat", + "hatte", + "hatten", + "hier", + "hin", + "hinter", + "ich", + "ihm", + "ihn", + "ihnen", + "ihr", + "ihre", + "ihrem", + "ihren", + "ihrer", + "ihres", + "im", + "in", + "indem", + "ins", + "ist", + "jede", + "jedem", + "jeden", + "jeder", + "jedes", + "jene", + "jenem", + "jenen", + "jener", + "jenes", + "jetzt", + "kann", + "kein", + "keine", + "keinem", + "keinen", + "keiner", + "keines", + "können", + "könnte", + "machen", + "man", + "manche", + "manchem", + "manchen", + "mancher", + "manches", + "mein", + "meine", + "meinem", + "meinen", + "meiner", + "meines", + "mich", + "mir", + "mit", + "muss", + "musste", + "nach", + "nicht", + "nichts", + "noch", + "nun", + "nur", + "ob", + "oder", + "ohne", + "sehr", + "sein", + "seine", + "seinem", + "seinen", + "seiner", + "seines", + "selbst", + "sich", + "sie", + "sind", + "so", + "solche", + "solchem", + "solchen", + "solcher", + "solches", + "soll", + "sollte", + "sondern", + "sonst", + "über", + "um", + "und", + "uns", + "unser", + "unsere", + "unserem", + "unseren", + "unseres", + "unter", + "viel", + "vom", + "von", + "vor", + "während", + "war", + "waren", + "warst", + "was", + "weg", + "weil", + "weiter", + "welche", + "welchem", + "welchen", + "welcher", + "welches", + "wenn", + "werde", + "werden", + "wie", + "wieder", + "will", + "wir", + "wird", + "wirst", + "wo", + "wollen", + "wollte", + "würde", + "würden", + "zu", + "zum", + "zur", + "zwar", + "zwischen", +}; + +static GREEK: Set<&'static str> = phf_set! { + "η", + "κ", + "ο", + "ἃ", + "ἡ", + "ἢ", + "ἣ", + "ἤ", + "ἥ", + "ὁ", + "ὃ", + "ὅ", + "ὦ", + "ᾧ", + "δ'", + "αν", + "αἱ", + "αἳ", + "αἵ", + "αὖ", + "γα", + "γε", + "δέ", + "δή", + "δε", + "δὲ", + "δὴ", + "δ’", + "επ", + "εἰ", + "εἴ", + "θα", + "κι", + "μή", + "μα", + "με", + "μη", + "μὴ", + "να", + "οι", + "οἱ", + "οἳ", + "οὐ", + "οὗ", + "σε", + "σύ", + "σὺ", + "τά", + "τί", + "τα", + "τε", + "τι", + "το", + "τό", + "τὰ", + "τὸ", + "τῇ", + "τῷ", + "ωσ", + "ἀπ", + "ἀφ", + "ἂν", + "ἄν", + "ἐκ", + "ἐν", + "ἐξ", + "ἐφ", + "ἧς", + "ὃν", + "ὃς", + "ὅς", + "ὅσ", + "ὑπ", + "ὡς", + "ὡσ", + "ὥς", + "δι'", + "γα^", + "απο", + "γάρ", + "για", + "γὰρ", + "δαί", + "δαὶ", + "δεν", + "διά", + "διὰ", + "εαν", + "ενω", + "επι", + "εἰς", + "εἰσ", + "καί", + "καθ", + "και", + "κατ", + "καὶ", + "κἀν", + "κἂν", + "μέν", + "μεθ", + "μετ", + "μην", + "μἐν", + "μὲν", + "μὴν", + "οσο", + "οτι", + "οἷς", + "οὐδ", + "οὐκ", + "οὐχ", + "οὓς", + "οὖν", + "παρ", + "που", + "ποῦ", + "προ", + "πρὸ", + "πως", + "πωσ", + "στη", + "στο", + "σόσ", + "σύν", + "σὸς", + "σὺν", + "τήν", + "τίς", + "τίσ", + "την", + "τησ", + "τις", + "τισ", + "τοί", + "τοι", + "τον", + "του", + "τοῦ", + "των", + "τόν", + "τὰς", + "τὴν", + "τὸν", + "τῆς", + "τῆσ", + "τῶν", + "ἀπό", + "ἀπὸ", + "ἄρα", + "ἅμα", + "ἐάν", + "ἐγώ", + "ἐγὼ", + "ἐπί", + "ἐπὶ", + "ἐὰν", + "ἔτι", + "ἵνα", + "ὅδε", + "ὅτε", + "ὅτι", + "ὑπό", + "ὑπὸ", + "ἀλλ'", + "αλλα", + "αντι", + "αυτα", + "αυτη", + "αυτο", + "γοῦν", + "δαίσ", + "δαὶς", + "εἰμί", + "εἰμὶ", + "εἴμι", + "εἴτε", + "ισωσ", + "κατά", + "κατα", + "κατὰ", + "μήτε", + "μετά", + "μετα", + "μετὰ", + "ομωσ", + "οπωσ", + "οὐδέ", + "οὐδὲ", + "οὐχὶ", + "οὔτε", + "οὕτω", + "παρά", + "παρα", + "παρὰ", + "περί", + "περὶ", + "ποια", + "ποιο", + "ποτε", + "προσ", + "πρόσ", + "πρὸς", + "στην", + "στον", + "ταῖς", + "τινα", + "τοτε", + "τούσ", + "τοὺς", + "τοῖς", + "τότε", + "ἀλλά", + "ἀλλὰ", + "ἀλλ’", + "ἐμόσ", + "ἐμὸς", + "ἐπεὶ", + "ἐστι", + "ὅθεν", + "ὅπερ", + "ὑμόσ", + "ὑπέρ", + "ὑπὲρ", + "ὥστε", + "αυτεσ", + "αυτοι", + "αυτοσ", + "αυτων", + "αὐτόσ", + "αὐτὸς", + "ειμαι", + "ειναι", + "εισαι", + "ειστε", + "οὐδὲν", + "οὕτως", + "οὕτωσ", + "οὗτος", + "οὗτοσ", + "ποιεσ", + "ποιοι", + "ποιοσ", + "ποιων", + "ἄλλος", + "ἄλλοσ", + "ὅστις", + "ὅστισ", + "αυτουσ", + "εκεινα", + "εκεινη", + "εκεινο", + "καίτοι", + "οὐδείσ", + "οὐδεὶς", + "ποιουσ", + "ἑαυτοῦ", + "ειμαστε", + "εκεινεσ", + "εκεινοι", + "εκεινοσ", + "εκεινων", + "εκεινουσ", + "τοιοῦτος", + "τοιοῦτοσ", +}; + +static HUNGARIAN: Set<&'static str> = phf_set! { + "a", + "abban", + "ahhoz", + "ahogy", + "ahol", + "aki", + "akik", + "akkor", + "alatt", + "által", + "általában", + "amely", + "amelyek", + "amelyekben", + "amelyeket", + "amelyet", + "amelynek", + "ami", + "amíg", + "amikor", + "amit", + "amolyan", + "annak", + "arra", + "arról", + "át", + "az", + "azért", + "azok", + "azon", + "azonban", + "azt", + "aztán", + "azután", + "azzal", + "bár", + "be", + "belül", + "benne", + "cikk", + "cikkek", + "cikkeket", + "csak", + "de", + "e", + "ebben", + "eddig", + "egész", + "egy", + "egyéb", + "egyes", + "egyetlen", + "egyik", + "egyre", + "ehhez", + "ekkor", + "el", + "elég", + "ellen", + "elõ", + "elõször", + "elõtt", + "elsõ", + "emilyen", + "én", + "ennek", + "éppen", + "erre", + "és", + "ez", + "ezek", + "ezen", + "ezért", + "ezt", + "ezzel", + "fel", + "felé", + "hanem", + "hiszen", + "hogy", + "hogyan", + "igen", + "így", + "ill", + "ill.", + "illetve", + "ilyen", + "ilyenkor", + "ismét", + "ison", + "itt", + "jó", + "jobban", + "jól", + "kell", + "kellett", + "keressünk", + "keresztül", + "ki", + "kívül", + "között", + "közül", + "legalább", + "legyen", + "lehet", + "lehetett", + "lenne", + "lenni", + "lesz", + "lett", + "maga", + "magát", + "majd", + "már", + "más", + "másik", + "meg", + "még", + "mellett", + "mely", + "melyek", + "mert", + "mi", + "miért", + "míg", + "mikor", + "milyen", + "minden", + "mindenki", + "mindent", + "mindig", + "mint", + "mintha", + "mit", + "mivel", + "most", + "nagy", + "nagyobb", + "nagyon", + "ne", + "néha", + "néhány", + "nekem", + "neki", + "nélkül", + "nem", + "nincs", + "õ", + "õk", + "õket", + "olyan", + "össze", + "ott", + "pedig", + "persze", + "rá", + "s", + "saját", + "sem", + "semmi", + "sok", + "sokat", + "sokkal", + "számára", + "szemben", + "szerint", + "szinte", + "talán", + "tehát", + "teljes", + "több", + "tovább", + "továbbá", + "úgy", + "ugyanis", + "új", + "újabb", + "újra", + "után", + "utána", + "utolsó", + "vagy", + "vagyis", + "vagyok", + "valaki", + "valami", + "valamint", + "való", + "van", + "vannak", + "vele", + "vissza", + "viszont", + "volna", + "volt", + "voltak", + "voltam", + "voltunk", +}; + +static ITALIAN: Set<&'static str> = phf_set! { + "a", + "abbia", + "abbiamo", + "abbiano", + "abbiate", + "ad", + "agl", + "agli", + "ai", + "al", + "all", + "alla", + "alle", + "allo", + "anche", + "avemmo", + "avendo", + "avesse", + "avessero", + "avessi", + "avessimo", + "aveste", + "avesti", + "avete", + "aveva", + "avevamo", + "avevano", + "avevate", + "avevi", + "avevo", + "avrà", + "avrai", + "avranno", + "avrebbe", + "avrebbero", + "avrei", + "avremmo", + "avremo", + "avreste", + "avresti", + "avrete", + "avrò", + "avuta", + "avute", + "avuti", + "avuto", + "c", + "che", + "chi", + "ci", + "coi", + "col", + "come", + "con", + "contro", + "cui", + "da", + "dagl", + "dagli", + "dai", + "dal", + "dall", + "dalla", + "dalle", + "dallo", + "degl", + "degli", + "dei", + "del", + "dell", + "della", + "delle", + "dello", + "di", + "dov", + "dove", + "e", + "è", + "ebbe", + "ebbero", + "ebbi", + "ed", + "era", + "erano", + "eravamo", + "eravate", + "eri", + "ero", + "essendo", + "faccia", + "facciamo", + "facciano", + "facciate", + "faccio", + "facemmo", + "facendo", + "facesse", + "facessero", + "facessi", + "facessimo", + "faceste", + "facesti", + "faceva", + "facevamo", + "facevano", + "facevate", + "facevi", + "facevo", + "fai", + "fanno", + "farà", + "farai", + "faranno", + "farebbe", + "farebbero", + "farei", + "faremmo", + "faremo", + "fareste", + "faresti", + "farete", + "farò", + "fece", + "fecero", + "feci", + "fosse", + "fossero", + "fossi", + "fossimo", + "foste", + "fosti", + "fu", + "fui", + "fummo", + "furono", + "gli", + "ha", + "hai", + "hanno", + "ho", + "i", + "il", + "in", + "io", + "l", + "la", + "le", + "lei", + "li", + "lo", + "loro", + "lui", + "ma", + "mi", + "mia", + "mie", + "miei", + "mio", + "ne", + "negl", + "negli", + "nei", + "nel", + "nell", + "nella", + "nelle", + "nello", + "noi", + "non", + "nostra", + "nostre", + "nostri", + "nostro", + "o", + "per", + "perché", + "più", + "quale", + "quanta", + "quante", + "quanti", + "quanto", + "quella", + "quelle", + "quelli", + "quello", + "questa", + "queste", + "questi", + "questo", + "sarà", + "sarai", + "saranno", + "sarebbe", + "sarebbero", + "sarei", + "saremmo", + "saremo", + "sareste", + "saresti", + "sarete", + "sarò", + "se", + "sei", + "si", + "sia", + "siamo", + "siano", + "siate", + "siete", + "sono", + "sta", + "stai", + "stando", + "stanno", + "starà", + "starai", + "staranno", + "starebbe", + "starebbero", + "starei", + "staremmo", + "staremo", + "stareste", + "staresti", + "starete", + "starò", + "stava", + "stavamo", + "stavano", + "stavate", + "stavi", + "stavo", + "stemmo", + "stesse", + "stessero", + "stessi", + "stessimo", + "steste", + "stesti", + "stette", + "stettero", + "stetti", + "stia", + "stiamo", + "stiano", + "stiate", + "sto", + "su", + "sua", + "sue", + "sugl", + "sugli", + "sui", + "sul", + "sull", + "sulla", + "sulle", + "sullo", + "suo", + "suoi", + "ti", + "tra", + "tu", + "tua", + "tue", + "tuo", + "tuoi", + "tutti", + "tutto", + "un", + "una", + "uno", + "vi", + "voi", + "vostra", + "vostre", + "vostri", + "vostro", +}; + +/* +Not yet available for auto-detection + +static KAZAKH: Set<&'static str> = phf_set! { + "", + "е", + "о", + "я", + "ә", + "ай", + "ал", + "ау", + "ах", + "ей", + "еш", + "ие", + "кә", + "ой", + "ол", + "ох", + "па", + "уа", + "эй", + "эх", + "әй", + "өз", + "өй", + "ана", + "арс", + "аһа", + "бар", + "беу", + "біз", + "бұл", + "жоқ", + "кәһ", + "мен", + "моһ", + "осы", + "оһо", + "пай", + "сен", + "сол", + "соң", + "сіз", + "тек", + "тәк", + "уай", + "уау", + "ура", + "шек", + "ырс", + "ырқ", + "ыңқ", + "ірк", + "қап", + "құр", + "үйт", + "әні", + "өзі", + "арс-ұрс", + "пай-пай", + "паһ-паһ", + "қош-қош", + "анау", + "барқ", + "бері", + "бойы", + "болп", + "борт", + "былп", + "бүйт", + "бәрі", + "гүрс", + "гөрі", + "дүрс", + "дүңк", + "емес", + "жалп", + "желп", + "жуық", + "кірт", + "күрт", + "күңк", + "кәне", + "кәні", + "маңқ", + "морт", + "мына", + "мышы", + "мыңқ", + "міне", + "одан", + "олар", + "онда", + "оның", + "оған", + "пфша", + "пырс", + "пішә", + "сарт", + "саңқ", + "сона", + "сыңқ", + "тарс", + "таяу", + "тағы", + "таңқ", + "тырс", + "тыңқ", + "түге", + "шаңқ", + "шырт", + "шіңк", + "шәйт", + "ғана", + "қана", + "қолп", + "қорс", + "қоса", + "қыңқ", + "үшін", + "әйда", + "әрне", + "өзге", + "өзім", + "өзің", + "жалт-жалт", + "жалт-жұлт", + "сарт-сұрт", + "тарс-тұрс", + "шаңқ-шаңқ", + "шаңқ-шұңқ", + "қалт-қалт", + "қалт-құлт", + "қаңқ-қаңқ", + "қаңқ-құңқ", + "барша", + "бетер", + "бізге", + "бірақ", + "бірге", + "біреу", + "бүкіл", + "бұрын", + "дейін", + "ешбір", + "ешкім", + "кейін", + "күллі", + "күшім", + "маған", + "менде", + "менен", + "менің", + "мынау", + "пішту", + "сайын", + "салым", + "саған", + "сенде", + "сенен", + "сенің", + "солай", + "сонау", + "сорап", + "сізге", + "таман", + "тарта", + "түгел", + "шақты", + "шейін", + "ғұрлы", + "қарай", + "қатар", + "құрау", + "әрбір", + "әрине", + "әркім", + "әттең", + "әукім", + "өзіме", + "өзіне", + "сенен онан", + "арбаң-арбаң", + "батыр-бұтыр", + "далаң-далаң", + "митың-митың", + "салаң-сұлаң", + "құрау-құрау", + "ыржың-тыржың", + "алайда", + "алатау", + "алақай", + "арнайы", + "арқылы", + "барлық", + "бізбен", + "бізден", + "біздер", + "біздің", + "бұндай", + "дәнеңе", + "ештеме", + "кейбір", + "кәнеки", + "мұндай", + "оларға", + "онымен", + "осылай", + "осынау", + "себебі", + "сияқты", + "сондай", + "сізбен", + "сізден", + "сіздер", + "сіздің", + "тағыда", + "туралы", + "шамалы", + "шіркін", + "ғұрлым", + "қаралы", + "әлдене", + "өзінің", + "бүгжең-бүгжең", + "тарбаң-тарбаң", + "қайқаң-құйқаң", + "қаңғыр-күңгір", + "бойымен", + "бірдеме", + "бірнеше", + "ешқайсы", + "ешқашан", + "менімен", + "олардан", + "олардың", + "олармен", + "осындай", + "сенімен", + "сонымен", + "япырмай", + "әйтпесе", + "әлдекім", + "әншейін", + "әрқайсы", + "әрқалай", + "өзімнің", + "өйткені", + "әттеген-ай", + "арсалаң-арсалаң", + "ербелең-ербелең", + "қызараң-қызараң", + "айтпақшы", + "біздерге", + "дегенмен", + "ешқандай", + "кейбіреу", + "масқарай", + "мәссаған", + "ойпырмай", + "сіздерге", + "қайсыбір", + "әлденеше", + "алдақашан", + "біздерден", + "біздердің", + "біздермен", + "бәрекелді", + "сондықтан", + "сіздерден", + "сіздердің", + "сіздермен", + "әйткенмен", + "әлдеқалай", + "әлдеқашан", + "әттегенай", + "әлдеқайдан", + "астапыралла", + "жаракімалла", +}; +*/ + +static NEPALI: Set<&'static str> = phf_set! { + "छ", + "त", + "न", + "म", + "र", + "अब", + "आए", + "उप", + "एक", + "ओठ", + "औं", + "का", + "कि", + "के", + "को", + "गए", + "छु", + "छू", + "जब", + "जे", + "जो", + "तर", + "तल", + "ती", + "नि", + "नै", + "नौ", + "भए", + "भन", + "भर", + "मा", + "यस", + "या", + "यी", + "यो", + "ले", + "सो", + "हो", + "कम से कम", + "अझै", + "अरु", + "अलग", + "आदि", + "आफू", + "आयो", + "कतै", + "कसै", + "किन", + "गयौ", + "गरि", + "गरी", + "गैर", + "चार", + "छन्", + "छैन", + "छौं", + "जान", + "जुन", + "ठीक", + "तथा", + "तिर", + "तीन", + "थिए", + "दिए", + "दुई", + "पछि", + "पटक", + "पनि", + "बने", + "बरु", + "बीच", + "भने", + "भन्", + "यति", + "यदि", + "यसो", + "रही", + "रूप", + "लाई", + "संग", + "सधै", + "सबै", + "समय", + "सही", + "सात", + "साथ", + "हरे", + "हुन", + "अन्य", + "आजको", + "आत्म", + "उनको", + "उनले", + "एउटै", + "एकदम", + "कसरी", + "कुनै", + "कुरा", + "केही", + "कोही", + "गरेर", + "गरौं", + "गर्छ", + "गर्न", + "चाले", + "जबकि", + "जसको", + "जसमा", + "जसले", + "जहाँ", + "तपाई", + "तिनी", + "तिमी", + "त्यो", + "थिएन", + "थियो", + "देखि", + "देखे", + "धेरै", + "नत्र", + "नयाँ", + "पर्छ", + "पाँच", + "प्लस", + "फेरी", + "बारे", + "भएको", + "मलाई", + "माथि", + "मेरो", + "यसको", + "यसरी", + "यहाँ", + "राखे", + "लगभग", + "लागि", + "शायद", + "संगै", + "सक्छ", + "सम्म", + "साथै", + "सायद", + "सारा", + "सोही", + "हरेक", + "हुने", + "हुन्", + "अक्सर", + "अगाडी", + "अर्को", + "आफ्नै", + "आफ्नो", + "कसैले", + "कृपया", + "गरेका", + "गरेको", + "गर्छु", + "गर्दै", + "गर्नु", + "गर्ने", + "चाहिए", + "जसबाट", + "जसलाई", + "जस्तै", + "जस्तो", + "जाहिर", + "तापनी", + "देखेर", + "नजिकै", + "निम्न", + "पक्का", + "पक्कै", + "पहिले", + "पहिलो", + "पूर्व", + "प्रति", + "बाहिर", + "बाहेक", + "बिशेष", + "बीचमा", + "भन्छु", + "भन्दा", + "भन्ने", + "भित्र", + "मात्र", + "मुख्य", + "यसपछि", + "यस्तो", + "रहेका", + "रहेको", + "राख्छ", + "सट्टा", + "सम्भव", + "हुन्छ", + "अनुसार", + "अन्यथा", + "अरुलाई", + "अर्थात", + "आफूलाई", + "उदाहरण", + "उहालाई", + "किनभने", + "क्रमशः", + "जताततै", + "तत्काल", + "तपाईको", + "तेस्रो", + "त्यहाँ", + "त्सपछि", + "त्सैले", + "देखियो", + "देखेको", + "दोस्रो", + "निम्ति", + "पाँचौं", + "प्रतेक", + "भन्छन्", + "भित्री", + "यथोचित", + "यद्यपि", + "राम्रो", + "वरीपरी", + "सबैलाई", + "स्पष्ट", + "अन्यत्र", + "अर्थात्", + "कहाँबाट", + "चाहन्छु", + "तदनुसार", + "तिनीहरू", + "देखिन्छ", + "पछिल्लो", + "पर्थ्यो", + "पहिल्यै", + "बिरुद्ध", + "यसबाहेक", + "साँच्चै", + "अन्तर्गत", + "तुरुन्तै", + "तेस्कारण", + "दिनुभएको", + "पर्याप्त", + "भन्नुभयो", + "यहाँसम्म", + "वास्तवमा", + "गर्नुपर्छ", + "जस्तोसुकै", + "तिनीहरुको", + "दिनुहुन्छ", + "निर्दिष्ट", + "कहिलेकाहीं", + "चाहनुहुन्छ", + "तिनिहरुलाई", + "निम्नानुसार", +}; + +static NORWEGIAN: Set<&'static str> = phf_set! { + "å", + "alle", + "at", + "av", + "både", + "båe", + "bare", + "begge", + "ble", + "blei", + "bli", + "blir", + "blitt", + "da", + "då", + "de", + "deg", + "dei", + "deim", + "deira", + "deires", + "dem", + "den", + "denne", + "der", + "dere", + "deres", + "det", + "dette", + "di", + "din", + "disse", + "ditt", + "du", + "dykk", + "dykkar", + "eg", + "ein", + "eit", + "eitt", + "eller", + "elles", + "en", + "enn", + "er", + "et", + "ett", + "etter", + "for", + "før", + "fordi", + "fra", + "ha", + "hadde", + "han", + "hans", + "har", + "hennar", + "henne", + "hennes", + "her", + "hjå", + "ho", + "hoe", + "honom", + "hoss", + "hossen", + "hun", + "hva", + "hvem", + "hver", + "hvilke", + "hvilken", + "hvis", + "hvor", + "hvordan", + "hvorfor", + "i", + "ikke", + "ikkje", + "ingen", + "ingi", + "inkje", + "inn", + "inni", + "ja", + "jeg", + "kan", + "kom", + "korleis", + "korso", + "kun", + "kunne", + "kva", + "kvar", + "kvarhelst", + "kven", + "kvi", + "kvifor", + "man", + "mange", + "me", + "med", + "medan", + "meg", + "meget", + "mellom", + "men", + "mi", + "min", + "mine", + "mitt", + "mot", + "mykje", + "nå", + "når", + "ned", + "no", + "noe", + "noen", + "noka", + "noko", + "nokon", + "nokor", + "nokre", + "og", + "også", + "om", + "opp", + "oss", + "over", + "på", + "så", + "samme", + "sånn", + "seg", + "selv", + "si", + "sia", + "sidan", + "siden", + "sin", + "sine", + "sitt", + "sjøl", + "skal", + "skulle", + "slik", + "so", + "som", + "somme", + "somt", + "til", + "um", + "upp", + "ut", + "uten", + "var", + "vår", + "være", + "vart", + "vært", + "varte", + "ved", + "vere", + "verte", + "vi", + "vil", + "ville", + "vore", + "vors", + "vort", +}; + +static PORTUGUESE: Set<&'static str> = phf_set! { + "a", + "à", + "ao", + "aos", + "aquela", + "aquelas", + "aquele", + "aqueles", + "aquilo", + "as", + "às", + "até", + "com", + "como", + "da", + "das", + "de", + "dela", + "delas", + "dele", + "deles", + "depois", + "do", + "dos", + "e", + "ela", + "elas", + "ele", + "eles", + "em", + "entre", + "era", + "eram", + "éramos", + "essa", + "essas", + "esse", + "esses", + "esta", + "está", + "estamos", + "estão", + "estas", + "estava", + "estavam", + "estávamos", + "este", + "esteja", + "estejam", + "estejamos", + "estes", + "esteve", + "estive", + "estivemos", + "estiver", + "estivera", + "estiveram", + "estivéramos", + "estiverem", + "estivermos", + "estivesse", + "estivessem", + "estivéssemos", + "estou", + "eu", + "foi", + "fomos", + "for", + "fora", + "foram", + "fôramos", + "forem", + "formos", + "fosse", + "fossem", + "fôssemos", + "fui", + "há", + "haja", + "hajam", + "hajamos", + "hão", + "havemos", + "hei", + "houve", + "houvemos", + "houver", + "houvera", + "houverá", + "houveram", + "houvéramos", + "houverão", + "houverei", + "houverem", + "houveremos", + "houveria", + "houveriam", + "houveríamos", + "houvermos", + "houvesse", + "houvessem", + "houvéssemos", + "isso", + "isto", + "já", + "lhe", + "lhes", + "mais", + "mas", + "me", + "mesmo", + "meu", + "meus", + "minha", + "minhas", + "muito", + "na", + "não", + "nas", + "nem", + "no", + "nos", + "nós", + "nossa", + "nossas", + "nosso", + "nossos", + "num", + "numa", + "o", + "os", + "ou", + "para", + "pela", + "pelas", + "pelo", + "pelos", + "por", + "qual", + "quando", + "que", + "quem", + "são", + "se", + "seja", + "sejam", + "sejamos", + "sem", + "será", + "serão", + "serei", + "seremos", + "seria", + "seriam", + "seríamos", + "seu", + "seus", + "só", + "somos", + "sou", + "sua", + "suas", + "também", + "te", + "tem", + "tém", + "temos", + "tenha", + "tenham", + "tenhamos", + "tenho", + "terá", + "terão", + "terei", + "teremos", + "teria", + "teriam", + "teríamos", + "teu", + "teus", + "teve", + "tinha", + "tinham", + "tínhamos", + "tive", + "tivemos", + "tiver", + "tivera", + "tiveram", + "tivéramos", + "tiverem", + "tivermos", + "tivesse", + "tivessem", + "tivéssemos", + "tu", + "tua", + "tuas", + "um", + "uma", + "você", + "vocês", + "vos", +}; + +static ROMANIAN: Set<&'static str> = phf_set! { + "a", + "abia", + "acea", + "aceasta", + "această", + "aceea", + "aceeasi", + "acei", + "aceia", + "acel", + "acela", + "acelasi", + "acele", + "acelea", + "acest", + "acesta", + "aceste", + "acestea", + "acestei", + "acestia", + "acestui", + "aceşti", + "aceştia", + "adica", + "ai", + "aia", + "aibă", + "aici", + "al", + "ala", + "ale", + "alea", + "alt", + "alta", + "altceva", + "altcineva", + "alte", + "altfel", + "alti", + "altii", + "altul", + "am", + "anume", + "apoi", + "ar", + "are", + "as", + "asa", + "asta", + "astea", + "astfel", + "asupra", + "atare", + "atat", + "atata", + "atatea", + "atatia", + "ati", + "atit", + "atita", + "atitea", + "atitia", + "atunci", + "au", + "avea", + "avem", + "aveţi", + "avut", + "aş", + "aţi", + "ba", + "ca", + "cam", + "cand", + "care", + "careia", + "carora", + "caruia", + "cat", + "cât", + "câte", + "catre", + "câtva", + "câţi", + "ce", + "cea", + "ceea", + "cei", + "ceilalti", + "cel", + "cele", + "celor", + "ceva", + "chiar", + "ci", + "cind", + "cînd", + "cine", + "cineva", + "cit", + "cît", + "cita", + "cite", + "cîte", + "citeva", + "citi", + "citiva", + "cîtva", + "cîţi", + "cu", + "cui", + "cum", + "cumva", + "că", + "căci", + "cărei", + "căror", + "cărui", + "către", + "da", + "daca", + "dacă", + "dar", + "dat", + "dată", + "dau", + "de", + "deasupra", + "deci", + "decit", + "deja", + "desi", + "despre", + "deşi", + "din", + "dintr", + "dintr-", + "dintre", + "doar", + "doi", + "doilea", + "două", + "drept", + "dupa", + "după", + "dă", + "e", + "ea", + "ei", + "el", + "ele", + "era", + "eram", + "este", + "eu", + "eşti", + "face", + "fara", + "fata", + "fel", + "fi", + "fie", + "fiecare", + "fii", + "fim", + "fiu", + "fiţi", + "foarte", + "fost", + "fără", + "i", + "ia", + "iar", + "ii", + "îi", + "il", + "îl", + "imi", + "îmi", + "in", + "în", + "inainte", + "inapoi", + "inca", + "incit", + "insa", + "intr", + "intre", + "isi", + "iti", + "îţi", + "la", + "lângă", + "le", + "li", + "lîngă", + "lor", + "lui", + "m", + "ma", + "mai", + "mâine", + "mea", + "mei", + "mele", + "mereu", + "meu", + "mi", + "mie", + "mîine", + "mine", + "mod", + "mult", + "multa", + "multe", + "multi", + "multă", + "mulţi", + "mă", + "ne", + "ni", + "nici", + "nimeni", + "nimic", + "niste", + "nişte", + "noastre", + "noastră", + "noi", + "nostri", + "nostru", + "nou", + "noua", + "nouă", + "noştri", + "nu", + "numai", + "o", + "or", + "ori", + "oricând", + "oricare", + "oricât", + "orice", + "oricînd", + "oricine", + "oricît", + "oricum", + "oriunde", + "pai", + "până", + "parca", + "patra", + "patru", + "pe", + "pentru", + "peste", + "pic", + "pina", + "pînă", + "poate", + "pot", + "prea", + "prima", + "primul", + "prin", + "printr-", + "putini", + "puţin", + "puţina", + "puţină", + "sa", + "sa-mi", + "sa-ti", + "sai", + "sale", + "sau", + "se", + "si", + "sint", + "sintem", + "spate", + "spre", + "sub", + "sunt", + "suntem", + "sunteţi", + "sus", + "să", + "săi", + "său", + "t", + "ta", + "tale", + "te", + "ti", + "tine", + "toata", + "toate", + "toată", + "tocmai", + "tot", + "toti", + "totul", + "totusi", + "totuşi", + "toţi", + "trei", + "treia", + "treilea", + "tu", + "tuturor", + "tăi", + "tău", + "u", + "ul", + "ului", + "un", + "una", + "unde", + "undeva", + "unei", + "uneia", + "unele", + "uneori", + "unii", + "unor", + "unora", + "unu", + "unui", + "unuia", + "unul", + "v", + "va", + "vi", + "voastre", + "voastră", + "voi", + "vom", + "vor", + "vostru", + "vouă", + "voştri", + "vreo", + "vreun", + "vă", + "zi", + "zice", + "şi", + "ţi", + "ţie", + "ăla", + "ălea", + "ăsta", + "ăstea", + "ăştia", +}; + +static RUSSIAN: Set<&'static str> = phf_set! { + "а", + "в", + "ж", + "и", + "к", + "о", + "с", + "у", + "я", + "бы", + "во", + "вы", + "да", + "до", + "ее", + "ей", + "же", + "за", + "из", + "им", + "их", + "ли", + "мы", + "на", + "не", + "ни", + "но", + "ну", + "об", + "он", + "от", + "по", + "со", + "то", + "ты", + "уж", + "без", + "был", + "вам", + "вас", + "вот", + "все", + "всю", + "где", + "два", + "для", + "его", + "ему", + "еще", + "или", + "как", + "кто", + "мне", + "мой", + "моя", + "над", + "нас", + "нее", + "ней", + "нет", + "ним", + "них", + "она", + "они", + "под", + "при", + "про", + "раз", + "сам", + "так", + "там", + "тем", + "том", + "тот", + "три", + "тут", + "уже", + "чем", + "что", + "эти", + "эту", + "была", + "были", + "было", + "быть", + "ведь", + "всех", + "даже", + "если", + "есть", + "куда", + "меня", + "надо", + "него", + "один", + "свою", + "себе", + "себя", + "тебя", + "того", + "тоже", + "хоть", + "чего", + "чтоб", + "чуть", + "этой", + "этом", + "этот", + "более", + "будет", + "будто", + "вдруг", + "всего", + "зачем", + "здесь", + "какая", + "какой", + "когда", + "лучше", + "между", + "много", + "может", + "можно", + "опять", + "перед", + "после", + "потом", + "почти", + "разве", + "такой", + "тогда", + "через", + "чтобы", + "этого", + "больше", + "всегда", + "другой", + "иногда", + "нельзя", + "нибудь", + "ничего", + "потому", + "сейчас", + "совсем", + "теперь", + "только", + "хорошо", + "впрочем", + "конечно", + "наконец", + "никогда", +}; + +static SPANISH: Set<&'static str> = phf_set! { + "a", + "al", + "algo", + "algunas", + "algunos", + "ante", + "antes", + "como", + "con", + "contra", + "cual", + "cuando", + "de", + "del", + "desde", + "donde", + "durante", + "e", + "el", + "él", + "ella", + "ellas", + "ellos", + "en", + "entre", + "era", + "erais", + "éramos", + "eran", + "eras", + "eres", + "es", + "esa", + "esas", + "ese", + "eso", + "esos", + "esta", + "está", + "estaba", + "estabais", + "estábamos", + "estaban", + "estabas", + "estad", + "estada", + "estadas", + "estado", + "estados", + "estáis", + "estamos", + "están", + "estando", + "estar", + "estará", + "estarán", + "estarás", + "estaré", + "estaréis", + "estaremos", + "estaría", + "estaríais", + "estaríamos", + "estarían", + "estarías", + "estas", + "estás", + "este", + "esté", + "estéis", + "estemos", + "estén", + "estés", + "esto", + "estos", + "estoy", + "estuve", + "estuviera", + "estuvierais", + "estuviéramos", + "estuvieran", + "estuvieras", + "estuvieron", + "estuviese", + "estuvieseis", + "estuviésemos", + "estuviesen", + "estuvieses", + "estuvimos", + "estuviste", + "estuvisteis", + "estuvo", + "fue", + "fuera", + "fuerais", + "fuéramos", + "fueran", + "fueras", + "fueron", + "fuese", + "fueseis", + "fuésemos", + "fuesen", + "fueses", + "fui", + "fuimos", + "fuiste", + "fuisteis", + "ha", + "habéis", + "había", + "habíais", + "habíamos", + "habían", + "habías", + "habida", + "habidas", + "habido", + "habidos", + "habiendo", + "habrá", + "habrán", + "habrás", + "habré", + "habréis", + "habremos", + "habría", + "habríais", + "habríamos", + "habrían", + "habrías", + "han", + "has", + "hasta", + "hay", + "haya", + "hayáis", + "hayamos", + "hayan", + "hayas", + "he", + "hemos", + "hube", + "hubiera", + "hubierais", + "hubiéramos", + "hubieran", + "hubieras", + "hubieron", + "hubiese", + "hubieseis", + "hubiésemos", + "hubiesen", + "hubieses", + "hubimos", + "hubiste", + "hubisteis", + "hubo", + "la", + "las", + "le", + "les", + "lo", + "los", + "más", + "me", + "mi", + "mí", + "mía", + "mías", + "mío", + "míos", + "mis", + "mucho", + "muchos", + "muy", + "nada", + "ni", + "no", + "nos", + "nosotras", + "nosotros", + "nuestra", + "nuestras", + "nuestro", + "nuestros", + "o", + "os", + "otra", + "otras", + "otro", + "otros", + "para", + "pero", + "poco", + "por", + "porque", + "que", + "qué", + "quien", + "quienes", + "se", + "sea", + "seáis", + "seamos", + "sean", + "seas", + "sentid", + "sentida", + "sentidas", + "sentido", + "sentidos", + "será", + "serán", + "serás", + "seré", + "seréis", + "seremos", + "sería", + "seríais", + "seríamos", + "serían", + "serías", + "sí", + "siente", + "sin", + "sintiendo", + "sobre", + "sois", + "somos", + "son", + "soy", + "su", + "sus", + "suya", + "suyas", + "suyo", + "suyos", + "también", + "tanto", + "te", + "tendrá", + "tendrán", + "tendrás", + "tendré", + "tendréis", + "tendremos", + "tendría", + "tendríais", + "tendríamos", + "tendrían", + "tendrías", + "tened", + "tenéis", + "tenemos", + "tenga", + "tengáis", + "tengamos", + "tengan", + "tengas", + "tengo", + "tenía", + "teníais", + "teníamos", + "tenían", + "tenías", + "tenida", + "tenidas", + "tenido", + "tenidos", + "teniendo", + "ti", + "tiene", + "tienen", + "tienes", + "todo", + "todos", + "tu", + "tú", + "tus", + "tuve", + "tuviera", + "tuvierais", + "tuviéramos", + "tuvieran", + "tuvieras", + "tuvieron", + "tuviese", + "tuvieseis", + "tuviésemos", + "tuviesen", + "tuvieses", + "tuvimos", + "tuviste", + "tuvisteis", + "tuvo", + "tuya", + "tuyas", + "tuyo", + "tuyos", + "un", + "una", + "uno", + "unos", + "vosostras", + "vosostros", + "vuestra", + "vuestras", + "vuestro", + "vuestros", + "y", + "ya", + "yo", +}; + +static SWEDISH: Set<&'static str> = phf_set! { + "alla", + "allt", + "än", + "är", + "åt", + "att", + "av", + "blev", + "bli", + "blir", + "blivit", + "då", + "där", + "de", + "dem", + "den", + "denna", + "deras", + "dess", + "dessa", + "det", + "detta", + "dig", + "din", + "dina", + "ditt", + "du", + "efter", + "ej", + "eller", + "en", + "er", + "era", + "ert", + "ett", + "för", + "från", + "ha", + "hade", + "han", + "hans", + "har", + "här", + "henne", + "hennes", + "hon", + "honom", + "hur", + "i", + "icke", + "ingen", + "inom", + "inte", + "jag", + "ju", + "kan", + "kunde", + "man", + "med", + "mellan", + "men", + "mig", + "min", + "mina", + "mitt", + "mot", + "mycket", + "någon", + "något", + "några", + "när", + "ni", + "nu", + "och", + "om", + "oss", + "över", + "på", + "så", + "sådan", + "sådana", + "sådant", + "samma", + "sedan", + "sig", + "sin", + "sina", + "sitta", + "själv", + "skulle", + "som", + "till", + "under", + "upp", + "ut", + "utan", + "vad", + "var", + "vår", + "vara", + "våra", + "varför", + "varit", + "varje", + "vars", + "vart", + "vårt", + "vem", + "vi", + "vid", + "vilka", + "vilkas", + "vilken", + "vilket", +}; + +static TURKISH: Set<&'static str> = phf_set! { + "acaba", + "ama", + "aslında", + "az", + "bazı", + "belki", + "biri", + "birkaç", + "birşey", + "biz", + "bu", + "çok", + "çünkü", + "da", + "daha", + "de", + "defa", + "diye", + "en", + "eğer", + "gibi", + "hem", + "hep", + "hepsi", + "her", + "hiç", + "için", + "ile", + "ise", + "kez", + "ki", + "kim", + "mu", + "mü", + "mı", + "nasıl", + "ne", + "neden", + "nerde", + "nerede", + "nereye", + "niçin", + "niye", + "o", + "sanki", + "siz", + "tüm", + "ve", + "veya", + "ya", + "yani", + "şey", + "şu", +}; diff --git a/crates/nlp/src/lib.rs b/crates/nlp/src/lib.rs index d933ea0e..a5c174e8 100644 --- a/crates/nlp/src/lib.rs +++ b/crates/nlp/src/lib.rs @@ -1,59 +1,52 @@ -use ahash::AHashSet; - pub mod bayes; pub mod language; pub mod tokenizers; -pub mod transformers; - -#[derive(Debug, Clone, Default)] -pub struct PublicSuffix { - pub suffixes: AHashSet, - pub exceptions: AHashSet, - pub wildcards: Vec, -} - -impl PublicSuffix { - pub fn contains(&self, suffix: &str) -> bool { - self.suffixes.contains(suffix) - || (!self.exceptions.contains(suffix) - && self.wildcards.iter().any(|w| suffix.ends_with(w))) - } -} #[cfg(test)] mod test { use std::fs; + use utils::suffixlist::PublicSuffix; + use crate::{ - bayes::{bloom::BloomHasher, BayesClassifier, BayesModel}, - transformers::osb::{OsbToken, OsbTokenizer}, + bayes::{tokenize::BayesTokenizer, BayesClassifier, BayesModel}, + tokenizers::osb::{OsbToken, OsbTokenizer}, }; #[test] #[ignore] fn train() { - let db = fs::read_to_string("spam_or_not_spam.csv").unwrap(); + let db = + fs::read_to_string("/Users/me/code/mail-server/_ignore/spam_or_not_spam.csv").unwrap(); let mut bayes = BayesModel::default(); + let suffixes = PublicSuffix::default(); for line in db.lines() { let (text, is_spam) = line.rsplit_once(',').unwrap(); let is_spam = is_spam == "1"; bayes.train( - BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)), + OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5), is_spam, ); } println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,); - fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap(); + fs::write( + "/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin", + bincode::serialize(&bayes).unwrap(), + ) + .unwrap(); } #[test] #[ignore] fn classify() { - let model: BayesModel = - bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap(); + let model: BayesModel = bincode::deserialize( + &fs::read("/Users/me/code/mail-server/_ignore/spam_or_not_spam.bin").unwrap(), + ) + .unwrap(); let bayes = BayesClassifier::new(); + let suffixes = PublicSuffix::default(); for text in [ "i am attaching to this email a presentation to integrate the spreadsheet into our server", @@ -65,7 +58,7 @@ mod test { "{:?} -> {}", text, bayes - .classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| { + .classify(OsbTokenizer::new(BayesTokenizer::new(text, &suffixes), 5).filter_map(|x| model.weights.get(&x.inner).map(|w| { OsbToken { idx: x.idx, inner: *w, diff --git a/crates/nlp/src/tokenizers/chinese.rs b/crates/nlp/src/tokenizers/chinese.rs index f9ff355b..4ba246cb 100644 --- a/crates/nlp/src/tokenizers/chinese.rs +++ b/crates/nlp/src/tokenizers/chinese.rs @@ -29,7 +29,7 @@ use super::{InnerToken, Token}; use lazy_static::lazy_static; lazy_static! { - static ref JIEBA: Jieba = Jieba::new(); + pub static ref JIEBA: Jieba = Jieba::new(); } pub struct ChineseTokenizer<'x, T, I> diff --git a/crates/nlp/src/tokenizers/mod.rs b/crates/nlp/src/tokenizers/mod.rs index a3e42d47..1fb50901 100644 --- a/crates/nlp/src/tokenizers/mod.rs +++ b/crates/nlp/src/tokenizers/mod.rs @@ -23,6 +23,7 @@ pub mod chinese; pub mod japanese; +pub mod osb; pub mod space; pub mod types; pub mod word; diff --git a/crates/nlp/src/tokenizers/osb.rs b/crates/nlp/src/tokenizers/osb.rs new file mode 100644 index 00000000..c646a3a4 --- /dev/null +++ b/crates/nlp/src/tokenizers/osb.rs @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2023 Stalwart Labs Ltd. + * + * This file is part of the Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use std::{borrow::Cow, iter::Peekable}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OsbToken { + pub inner: T, + pub idx: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Gram<'x> { + Uni { t1: &'x str }, + Bi { t1: &'x str, t2: &'x str }, +} + +pub struct OsbTokenizer<'x, I, R> +where + I: Iterator>, + R: for<'y> From> + 'static, +{ + iter: Peekable, + buf: Vec>>, + window_size: usize, + window_pos: usize, + window_idx: usize, + phantom: std::marker::PhantomData, +} + +impl<'x, I, R> OsbTokenizer<'x, I, R> +where + I: Iterator>, + R: for<'y> From> + 'static, +{ + pub fn new(iter: I, window_size: usize) -> Self { + Self { + iter: iter.peekable(), + buf: vec![None; window_size], + window_pos: 0, + window_idx: 0, + window_size, + phantom: std::marker::PhantomData, + } + } +} + +impl<'x, I, R> Iterator for OsbTokenizer<'x, I, R> +where + I: Iterator>, + R: for<'y> From> + 'static, +{ + type Item = OsbToken; + + fn next(&mut self) -> Option { + let end_pos = (self.window_pos + self.window_idx) % self.window_size; + if self.buf[end_pos].is_none() { + self.buf[end_pos] = self.iter.next(); + } + + let t1 = self.buf[self.window_pos % self.window_size].as_deref()?; + let token = OsbToken { + inner: R::from(if self.window_idx != 0 { + Gram::Bi { + t1, + t2: self.buf[end_pos].as_deref()?, + } + } else { + Gram::Uni { t1 } + }), + idx: self.window_idx, + }; + + // Increment window index + self.window_idx += 1; + if self.window_idx == self.window_size + || (self.iter.peek().is_none() + && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none()) + { + self.buf[self.window_pos % self.window_size] = None; + self.window_idx = 0; + self.window_pos += 1; + } + + Some(token) + } +} + +#[cfg(test)] +mod test { + use std::borrow::Cow; + + use crate::tokenizers::osb::{Gram, OsbToken}; + + impl From> for String { + fn from(value: Gram<'_>) -> Self { + match value { + Gram::Uni { t1 } => t1.to_string(), + Gram::Bi { t1, t2 } => format!("{t1} {t2}"), + } + } + } + + #[test] + fn osb_tokenizer() { + assert_eq!( + super::OsbTokenizer::new( + "The quick brown fox jumps over the lazy dog and the lazy cat" + .split_ascii_whitespace() + .map(Cow::from), + 5, + ) + .collect::>(), + vec![ + OsbToken { + inner: "The".to_string(), + idx: 0 + }, + OsbToken { + inner: "The quick".to_string(), + idx: 1 + }, + OsbToken { + inner: "The brown".to_string(), + idx: 2 + }, + OsbToken { + inner: "The fox".to_string(), + idx: 3 + }, + OsbToken { + inner: "The jumps".to_string(), + idx: 4 + }, + OsbToken { + inner: "quick".to_string(), + idx: 0 + }, + OsbToken { + inner: "quick brown".to_string(), + idx: 1 + }, + OsbToken { + inner: "quick fox".to_string(), + idx: 2 + }, + OsbToken { + inner: "quick jumps".to_string(), + idx: 3 + }, + OsbToken { + inner: "quick over".to_string(), + idx: 4 + }, + OsbToken { + inner: "brown".to_string(), + idx: 0 + }, + OsbToken { + inner: "brown fox".to_string(), + idx: 1 + }, + OsbToken { + inner: "brown jumps".to_string(), + idx: 2 + }, + OsbToken { + inner: "brown over".to_string(), + idx: 3 + }, + OsbToken { + inner: "brown the".to_string(), + idx: 4 + }, + OsbToken { + inner: "fox".to_string(), + idx: 0 + }, + OsbToken { + inner: "fox jumps".to_string(), + idx: 1 + }, + OsbToken { + inner: "fox over".to_string(), + idx: 2 + }, + OsbToken { + inner: "fox the".to_string(), + idx: 3 + }, + OsbToken { + inner: "fox lazy".to_string(), + idx: 4 + }, + OsbToken { + inner: "jumps".to_string(), + idx: 0 + }, + OsbToken { + inner: "jumps over".to_string(), + idx: 1 + }, + OsbToken { + inner: "jumps the".to_string(), + idx: 2 + }, + OsbToken { + inner: "jumps lazy".to_string(), + idx: 3 + }, + OsbToken { + inner: "jumps dog".to_string(), + idx: 4 + }, + OsbToken { + inner: "over".to_string(), + idx: 0 + }, + OsbToken { + inner: "over the".to_string(), + idx: 1 + }, + OsbToken { + inner: "over lazy".to_string(), + idx: 2 + }, + OsbToken { + inner: "over dog".to_string(), + idx: 3 + }, + OsbToken { + inner: "over and".to_string(), + idx: 4 + }, + OsbToken { + inner: "the".to_string(), + idx: 0 + }, + OsbToken { + inner: "the lazy".to_string(), + idx: 1 + }, + OsbToken { + inner: "the dog".to_string(), + idx: 2 + }, + OsbToken { + inner: "the and".to_string(), + idx: 3 + }, + OsbToken { + inner: "the the".to_string(), + idx: 4 + }, + OsbToken { + inner: "lazy".to_string(), + idx: 0 + }, + OsbToken { + inner: "lazy dog".to_string(), + idx: 1 + }, + OsbToken { + inner: "lazy and".to_string(), + idx: 2 + }, + OsbToken { + inner: "lazy the".to_string(), + idx: 3 + }, + OsbToken { + inner: "lazy lazy".to_string(), + idx: 4 + }, + OsbToken { + inner: "dog".to_string(), + idx: 0 + }, + OsbToken { + inner: "dog and".to_string(), + idx: 1 + }, + OsbToken { + inner: "dog the".to_string(), + idx: 2 + }, + OsbToken { + inner: "dog lazy".to_string(), + idx: 3 + }, + OsbToken { + inner: "dog cat".to_string(), + idx: 4 + }, + OsbToken { + inner: "and".to_string(), + idx: 0 + }, + OsbToken { + inner: "and the".to_string(), + idx: 1 + }, + OsbToken { + inner: "and lazy".to_string(), + idx: 2 + }, + OsbToken { + inner: "and cat".to_string(), + idx: 3 + }, + OsbToken { + inner: "the".to_string(), + idx: 0 + }, + OsbToken { + inner: "the lazy".to_string(), + idx: 1 + }, + OsbToken { + inner: "the cat".to_string(), + idx: 2 + }, + OsbToken { + inner: "lazy".to_string(), + idx: 0 + }, + OsbToken { + inner: "lazy cat".to_string(), + idx: 1 + }, + OsbToken { + inner: "cat".to_string(), + idx: 0 + } + ] + ); + } +} diff --git a/crates/nlp/src/tokenizers/types.rs b/crates/nlp/src/tokenizers/types.rs index 97e9bccb..027cf767 100644 --- a/crates/nlp/src/tokenizers/types.rs +++ b/crates/nlp/src/tokenizers/types.rs @@ -23,7 +23,7 @@ use std::str::CharIndices; -use crate::PublicSuffix; +use utils::suffixlist::PublicSuffix; use super::Token; @@ -31,35 +31,39 @@ pub struct TypesTokenizer<'x, 'y> { text: &'x str, suffixes: &'y PublicSuffix, iter: CharIndices<'x>, - tokens: Vec>>, + tokens: Vec>>, peek_pos: usize, last_ch_is_space: bool, last_token_is_dot: bool, eof: bool, + tokenize_urls: bool, + tokenize_urls_without_scheme: bool, + tokenize_emails: bool, + tokenize_numbers: bool, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TokenType<'x> { - Alphabetic(&'x str), - Integer(&'x str), - Alphanumeric(&'x str), - Hexadecimal(&'x str), +pub enum TokenType { + Alphabetic(T), + Integer(T), + Alphanumeric(T), + Hexadecimal(T), Other(char), Punctuation(char), Space, // Detected types - Url(&'x str), - UrlNoScheme(&'x str), - UrlNoHost(&'x str), - Email(&'x str), - Float(&'x str), + Url(T), + UrlNoScheme(T), + UrlNoHost(T), + Email(T), + Float(T), } -impl Copy for Token> {} +impl Copy for Token> {} impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> { - type Item = Token>; + type Item = Token>; fn next(&mut self) -> Option { let token = self.peek()?; @@ -67,7 +71,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> { self.last_token_is_dot = matches!(token.word, TokenType::Punctuation('.')); // Try parsing URL with scheme - if matches!( + if self.tokenize_urls + && matches!( token.word, TokenType::Alphabetic(t) | TokenType::Hexadecimal(t) if t.len() <= 8 && t.chars().all(|c| c.is_ascii())) @@ -82,7 +87,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> { } // Try parsing email - if token.word.is_email_atom() + if self.tokenize_emails + && token.word.is_email_atom() && self.peek_has_tokens( &[TokenType::Punctuation('@'), TokenType::Punctuation('.')], TokenType::Space, @@ -97,7 +103,8 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> { } // Try parsing URL without scheme - if token.word.is_domain_atom(true) + if self.tokenize_urls_without_scheme + && token.word.is_domain_atom(true) && self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space) { if let Some(url) = self.try_parse_url(None) { @@ -109,7 +116,7 @@ impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> { } // Try parsing currencies and floating point numbers - if !last_is_dot { + if self.tokenize_numbers && !last_is_dot { if let Some(num) = self.try_parse_number() { self.peek_advance(); return Some(num); @@ -132,9 +139,33 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { suffixes, last_ch_is_space: false, last_token_is_dot: false, + tokenize_urls: true, + tokenize_urls_without_scheme: true, + tokenize_emails: true, + tokenize_numbers: true, } } + pub fn tokenize_urls(mut self, tokenize: bool) -> Self { + self.tokenize_urls = tokenize; + self + } + + pub fn tokenize_urls_without_scheme(mut self, tokenize: bool) -> Self { + self.tokenize_urls_without_scheme = tokenize; + self + } + + pub fn tokenize_emails(mut self, tokenize: bool) -> Self { + self.tokenize_emails = tokenize; + self + } + + pub fn tokenize_numbers(mut self, tokenize: bool) -> Self { + self.tokenize_numbers = tokenize; + self + } + fn consume(&mut self) -> bool { let mut has_alpha = false; let mut has_number = false; @@ -212,7 +243,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { } } - fn next_(&mut self) -> Option>> { + fn next_(&mut self) -> Option>> { if self.tokens.is_empty() && !self.eof { self.consume(); } @@ -223,7 +254,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { } } - fn peek(&mut self) -> Option>> { + fn peek(&mut self) -> Option>> { while self.tokens.len() <= self.peek_pos && !self.eof { self.consume(); } @@ -244,7 +275,11 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { self.peek_pos = 0; } - fn peek_has_tokens(&mut self, tokens: &[TokenType<'_>], stop_token: TokenType<'_>) -> bool { + fn peek_has_tokens( + &mut self, + tokens: &[TokenType<&'_ str>], + stop_token: TokenType<&'_ str>, + ) -> bool { let mut tokens = tokens.iter().copied(); let mut token = tokens.next().unwrap(); while let Some(t) = self.peek() { @@ -266,8 +301,8 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { fn try_parse_url( &mut self, - scheme_token: Option>>, - ) -> Option>> { + scheme_token: Option>>, + ) -> Option>> { let (has_scheme, allow_blank_host) = scheme_token.as_ref().map_or((false, false), |t| { ( true, @@ -480,7 +515,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { .into() } - fn try_parse_email(&mut self) -> Option>> { + fn try_parse_email(&mut self) -> Option>> { // Start token is a valid local part atom let start_token = self.peek()?; let mut last_is_dot = false; @@ -615,7 +650,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { None } - fn try_parse_number(&mut self) -> Option>> { + fn try_parse_number(&mut self) -> Option>> { self.peek_rewind(); let mut start_pos = usize::MAX; let mut end_pos = usize::MAX; @@ -698,7 +733,7 @@ impl<'x, 'y> TypesTokenizer<'x, 'y> { } } -impl<'x> TokenType<'x> { +impl TokenType { fn is_email_atom(&self) -> bool { matches!( self, @@ -744,7 +779,8 @@ impl<'x> TokenType<'x> { #[cfg(test)] mod test { - use crate::PublicSuffix; + + use utils::suffixlist::PublicSuffix; use super::{TokenType, TypesTokenizer}; diff --git a/crates/nlp/src/transformers/mod.rs b/crates/nlp/src/transformers/mod.rs deleted file mode 100644 index 1d2d365b..00000000 --- a/crates/nlp/src/transformers/mod.rs +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -pub mod osb; diff --git a/crates/nlp/src/transformers/osb.rs b/crates/nlp/src/transformers/osb.rs deleted file mode 100644 index 0c87132d..00000000 --- a/crates/nlp/src/transformers/osb.rs +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2023 Stalwart Labs Ltd. - * - * This file is part of the Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::iter::Peekable; - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct OsbToken { - pub inner: T, - pub idx: usize, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Gram<'x> { - Uni { t1: &'x str }, - Bi { t1: &'x str, t2: &'x str }, -} - -pub struct OsbTokenizer<'x, I> -where - I: Iterator, -{ - iter: Peekable, - buf: Vec>, - window_size: usize, - window_pos: usize, - window_idx: usize, -} - -impl<'x, I> OsbTokenizer<'x, I> -where - I: Iterator, -{ - pub fn new(iter: I, window_size: usize) -> Self { - Self { - iter: iter.peekable(), - buf: vec![None; window_size], - window_pos: 0, - window_idx: 0, - window_size, - } - } -} - -impl<'x, I> Iterator for OsbTokenizer<'x, I> -where - I: Iterator, -{ - type Item = OsbToken>; - - fn next(&mut self) -> Option { - let end_pos = (self.window_pos + self.window_idx) % self.window_size; - if self.buf[end_pos].is_none() { - self.buf[end_pos] = self.iter.next(); - } - - let t1 = self.buf[self.window_pos % self.window_size]?; - let token = OsbToken { - inner: if self.window_idx != 0 { - Gram::Bi { - t1, - t2: self.buf[end_pos]?, - } - } else { - Gram::Uni { t1 } - }, - idx: self.window_idx, - }; - - // Increment window - self.window_idx += 1; - if self.window_idx == self.window_size - || (self.iter.peek().is_none() - && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none()) - { - self.buf[self.window_pos % self.window_size] = None; - self.window_idx = 0; - self.window_pos += 1; - } - - Some(token) - } -} - -#[cfg(test)] -mod test { - use crate::transformers::osb::{Gram, OsbToken}; - - #[test] - fn osb_tokenizer() { - assert_eq!( - super::OsbTokenizer::new( - "The quick brown fox jumps over the lazy dog and the lazy cat" - .split_ascii_whitespace(), - 5 - ) - .collect::>(), - vec![ - OsbToken { - inner: Gram::Uni { t1: "The" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "The", - t2: "quick" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "The", - t2: "brown" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "The", - t2: "fox" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "The", - t2: "jumps" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "quick" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "quick", - t2: "brown" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "quick", - t2: "fox" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "quick", - t2: "jumps" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "quick", - t2: "over" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "brown" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "brown", - t2: "fox" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "brown", - t2: "jumps" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "brown", - t2: "over" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "brown", - t2: "the" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "fox" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "fox", - t2: "jumps" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "fox", - t2: "over" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "fox", - t2: "the" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "fox", - t2: "lazy" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "jumps" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "jumps", - t2: "over" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "jumps", - t2: "the" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "jumps", - t2: "lazy" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "jumps", - t2: "dog" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "over" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "over", - t2: "the" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "over", - t2: "lazy" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "over", - t2: "dog" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "over", - t2: "and" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "the" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "lazy" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "dog" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "and" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "the" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "lazy" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "lazy", - t2: "dog" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "lazy", - t2: "and" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "lazy", - t2: "the" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "lazy", - t2: "lazy" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "dog" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "dog", - t2: "and" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "dog", - t2: "the" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "dog", - t2: "lazy" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Bi { - t1: "dog", - t2: "cat" - }, - idx: 4 - }, - OsbToken { - inner: Gram::Uni { t1: "and" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "and", - t2: "the" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "and", - t2: "lazy" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Bi { - t1: "and", - t2: "cat" - }, - idx: 3 - }, - OsbToken { - inner: Gram::Uni { t1: "the" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "lazy" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Bi { - t1: "the", - t2: "cat" - }, - idx: 2 - }, - OsbToken { - inner: Gram::Uni { t1: "lazy" }, - idx: 0 - }, - OsbToken { - inner: Gram::Bi { - t1: "lazy", - t2: "cat" - }, - idx: 1 - }, - OsbToken { - inner: Gram::Uni { t1: "cat" }, - idx: 0 - } - ] - ); - } -} diff --git a/crates/smtp/Cargo.toml b/crates/smtp/Cargo.toml index f9fce2ab..dc8f846f 100644 --- a/crates/smtp/Cargo.toml +++ b/crates/smtp/Cargo.toml @@ -13,6 +13,7 @@ resolver = "2" [dependencies] utils = { path = "../utils" } +nlp = { path = "../nlp" } directory = { path = "../directory" } mail-auth = { git = "https://github.com/stalwartlabs/mail-auth" } mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] } @@ -50,7 +51,6 @@ num_cpus = "1.15.0" lazy_static = "1.4" whatlang = "0.16" imagesize = "0.12" -linkify = "0.10" idna = "0.4" decancer = "1.6.1" unicode-security = "0.1.0" diff --git a/crates/smtp/src/config/mod.rs b/crates/smtp/src/config/mod.rs index fd1f5753..13c4b1fc 100644 --- a/crates/smtp/src/config/mod.rs +++ b/crates/smtp/src/config/mod.rs @@ -39,7 +39,7 @@ use std::{ time::Duration, }; -use ahash::{AHashMap, AHashSet}; +use ahash::AHashMap; use directory::{Directory, DirectoryConfig, Lookup}; use mail_auth::{ common::crypto::{Ed25519Key, RsaKey, Sha256}, @@ -541,13 +541,6 @@ pub enum VerifyStrategy { Disable, } -#[derive(Debug, Clone, Default)] -pub struct PublicSuffix { - pub suffixes: AHashSet, - pub exceptions: AHashSet, - pub wildcards: Vec, -} - #[derive(Default)] pub struct ConfigContext<'x> { pub servers: &'x [Server], diff --git a/crates/smtp/src/config/resolver.rs b/crates/smtp/src/config/resolver.rs index a28471df..4d2e03a9 100644 --- a/crates/smtp/src/config/resolver.rs +++ b/crates/smtp/src/config/resolver.rs @@ -34,9 +34,7 @@ use mail_auth::{ }; use crate::{core::Resolvers, outbound::dane::DnssecResolver}; -use utils::config::Config; - -use super::PublicSuffix; +use utils::{config::Config, suffixlist::PublicSuffix}; pub trait ConfigResolver { fn build_resolvers(&self) -> super::Result; @@ -108,9 +106,9 @@ impl ConfigResolver for Config { } fn parse_public_suffix(&self) -> super::Result { - let mut ps = PublicSuffix::default(); - + let mut has_values = false; for (_, value) in self.values("resolver.public-suffix") { + has_values = true; let bytes = if value.starts_with("https://") || value.starts_with("http://") { match tokio::task::block_in_place(|| { reqwest::blocking::get(value).and_then(|r| { @@ -175,20 +173,7 @@ impl ConfigResolver for Config { match String::from_utf8(bytes) { Ok(list) => { - for line in list.lines() { - let line = line.trim().to_lowercase(); - if !line.starts_with("//") { - if let Some(domain) = line.strip_prefix('*') { - ps.wildcards.push(domain.to_string()); - } else if let Some(domain) = line.strip_prefix('!') { - ps.exceptions.insert(domain.to_string()); - } else { - ps.suffixes.insert(line.to_string()); - } - } - } - - return Ok(ps); + return Ok(PublicSuffix::from(list.as_str())); } Err(err) => { tracing::warn!( @@ -200,16 +185,10 @@ impl ConfigResolver for Config { } } - tracing::warn!("Failed to parse public suffixes from any source."); + if has_values { + tracing::warn!("Failed to parse public suffixes from any source."); + } - Ok(ps) - } -} - -impl PublicSuffix { - pub fn contains(&self, suffix: &str) -> bool { - self.suffixes.contains(suffix) - || (!self.exceptions.contains(suffix) - && self.wildcards.iter().any(|w| suffix.ends_with(w))) + Ok(PublicSuffix::default()) } } diff --git a/crates/smtp/src/config/scripts.rs b/crates/smtp/src/config/scripts.rs index 38c73a04..c841e2b6 100644 --- a/crates/smtp/src/config/scripts.rs +++ b/crates/smtp/src/config/scripts.rs @@ -21,25 +21,33 @@ * for more details. */ -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use directory::Lookup; +use nlp::bayes::{cache::BayesTokenCache, BayesClassifier}; use sieve::{compiler::grammar::Capability, Compiler, Runtime}; use crate::{ core::{SieveConfig, SieveCore}, scripts::{functions::register_functions, plugins::RegisterSievePlugins}, }; -use utils::config::{utils::AsKey, Config}; +use utils::{ + config::{utils::AsKey, Config}, + suffixlist::PublicSuffix, +}; -use super::{resolver::ConfigResolver, ConfigContext, PublicSuffix}; +use super::{resolver::ConfigResolver, ConfigContext}; pub trait ConfigSieve { fn parse_sieve(&self, ctx: &mut ConfigContext) -> super::Result; } -#[derive(Clone, Default)] pub struct SieveContext { pub psl: PublicSuffix, + pub bayes_classify: BayesClassifier, + pub bayes_cache: BayesTokenCache, + pub lookup_classify: Arc, + pub lookup_train: Arc, } impl ConfigSieve for Config { @@ -48,6 +56,29 @@ impl ConfigSieve for Config { let mut fnc_map = register_functions().register_plugins(); let sieve_ctx = SieveContext { psl: self.parse_public_suffix()?, + bayes_classify: BayesClassifier { + min_token_hits: self.property_or_static("bayes.min-token-hits", "2")?, + min_tokens: self.property_or_static("bayes.min-tokens", "11")?, + min_prob_strength: self.property_or_static("bayes.min-prob-strength", "0.05")?, + min_learns: self.property_or_static("bayes.min-learns", "200")?, + }, + bayes_cache: BayesTokenCache::new( + self.property_or_static("bayes.cache.capacity", "8192")?, + self.property_or_static("bayes.cache.ttl.positive", "1h")?, + self.property_or_static("bayes.cache.ttl.negative", "1h")?, + ), + lookup_classify: ctx + .directory + .lookups + .get("bayes.tokens.classify") + .ok_or("No lookup found for key bayes.tokens.classify.".to_string())? + .clone(), + lookup_train: ctx + .directory + .lookups + .get("bayes.tokens.train") + .ok_or("No lookup found for key bayes.tokens.train.".to_string())? + .clone(), }; // Allocate compiler and runtime diff --git a/crates/smtp/src/scripts/event_loop.rs b/crates/smtp/src/scripts/event_loop.rs index 6edcc9e3..e49575c1 100644 --- a/crates/smtp/src/scripts/event_loop.rs +++ b/crates/smtp/src/scripts/event_loop.rs @@ -24,7 +24,6 @@ use core::panic; use std::{sync::Arc, time::Duration}; -use ahash::AHashMap; use directory::Lookup; use mail_auth::common::headers::HeaderWriter; use sieve::{ @@ -68,8 +67,6 @@ impl SMTP { let mut modifications = vec![]; let mut keep_id = usize::MAX; - let mut plugin_data = AHashMap::new(); - // Start event loop while let Some(result) = instance.run(input) { match result { @@ -125,7 +122,6 @@ impl SMTP { span: &span, handle: &handle, core: self, - data: &mut plugin_data, message: instance.message(), arguments, }, diff --git a/crates/smtp/src/scripts/functions/text.rs b/crates/smtp/src/scripts/functions/text.rs index 56ebaa65..1fc06be3 100644 --- a/crates/smtp/src/scripts/functions/text.rs +++ b/crates/smtp/src/scripts/functions/text.rs @@ -21,11 +21,12 @@ * for more details. */ +use nlp::tokenizers::types::{TokenType, TypesTokenizer}; use sieve::{runtime::Variable, Context}; -use crate::{config::scripts::SieveContext, scripts::functions::url::tokenize_email}; +use crate::config::scripts::SieveContext; -use super::{html::html_to_tokens, url::tokenize_url, ApplyString}; +use super::{html::html_to_tokens, ApplyString}; pub fn fn_trim<'x>(_: &'x Context<'x, SieveContext>, v: Vec>) -> Variable<'x> { v[0].transform(|s| Variable::StringRef(s.trim())) @@ -106,13 +107,49 @@ pub fn fn_tokenize<'x>( ctx: &'x Context<'x, SieveContext>, mut v: Vec>, ) -> Variable<'x> { - match v[1].to_cow().as_ref() { - "html" => html_to_tokens(v[0].to_cow().as_ref()).into(), - "words" => tokenize_words(&v[0]), - "uri" | "url" => tokenize_url(ctx, v.remove(0), false), - "uri_strict" | "url_strict" => tokenize_url(ctx, v.remove(0), true), - "email" => tokenize_email(v.remove(0)), - _ => Variable::default(), + let (urls, urls_without_scheme, emails) = match v[1].to_cow().as_ref() { + "html" => return html_to_tokens(v[0].to_cow().as_ref()).into(), + "words" => return tokenize_words(&v[0]), + "uri" | "url" => (true, true, true), + "uri_strict" | "url_strict" => (true, false, false), + "email" => (false, false, true), + _ => return Variable::default(), + }; + + match v.remove(0) { + Variable::StringRef(text) => TypesTokenizer::new(text, &ctx.context().psl) + .tokenize_numbers(false) + .tokenize_urls(urls) + .tokenize_urls_without_scheme(urls_without_scheme) + .tokenize_emails(emails) + .filter_map(|t| match t.word { + TokenType::Url(text) if urls => Variable::StringRef(text).into(), + TokenType::UrlNoScheme(text) if urls_without_scheme => { + Variable::String(format!("https://{text}")).into() + } + TokenType::Email(text) if emails => Variable::StringRef(text).into(), + _ => None, + }) + .collect::>() + .into(), + v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => { + TypesTokenizer::new(v.to_cow().as_ref(), &ctx.context().psl) + .tokenize_numbers(false) + .tokenize_urls(urls) + .tokenize_urls_without_scheme(urls_without_scheme) + .tokenize_emails(emails) + .filter_map(|t| match t.word { + TokenType::Url(text) if urls => Variable::String(text.to_string()).into(), + TokenType::UrlNoScheme(text) if urls_without_scheme => { + Variable::String(format!("https://{text}")).into() + } + TokenType::Email(text) if emails => Variable::String(text.to_string()).into(), + _ => None, + }) + .collect::>() + .into() + } + v => v, } } diff --git a/crates/smtp/src/scripts/functions/url.rs b/crates/smtp/src/scripts/functions/url.rs index aaf86036..9ecbfd5a 100644 --- a/crates/smtp/src/scripts/functions/url.rs +++ b/crates/smtp/src/scripts/functions/url.rs @@ -21,94 +21,13 @@ * for more details. */ -use std::net::IpAddr; - use hyper::Uri; -use linkify::LinkKind; use sieve::{runtime::Variable, Context}; use crate::config::scripts::SieveContext; use super::ApplyString; -pub fn tokenize_url<'x>( - ctx: &'x Context<'x, SieveContext>, - v: Variable<'x>, - must_have_scheme: bool, -) -> Variable<'x> { - match v { - Variable::StringRef(text) => linkify::LinkFinder::new() - .url_must_have_scheme(must_have_scheme) - .kinds(&[LinkKind::Url]) - .links(text.as_ref()) - .filter_map(|url| filter_url(url.as_str(), must_have_scheme, ctx)) - .collect::>() - .into(), - v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => { - linkify::LinkFinder::new() - .url_must_have_scheme(must_have_scheme) - .kinds(&[LinkKind::Url]) - .links(v.to_cow().as_ref()) - .filter_map(|url| { - filter_url(url.as_str(), must_have_scheme, ctx).map(|v| v.into_owned()) - }) - .collect::>() - .into() - } - v => v, - } -} - -pub fn tokenize_email(v: Variable<'_>) -> Variable<'_> { - match v { - Variable::StringRef(text) => linkify::LinkFinder::new() - .email_domain_must_have_dot(true) - .kinds(&[LinkKind::Email]) - .links(text.as_ref()) - .map(|email| Variable::StringRef(email.as_str())) - .collect::>() - .into(), - v @ (Variable::String(_) | Variable::Array(_) | Variable::ArrayRef(_)) => { - linkify::LinkFinder::new() - .email_domain_must_have_dot(true) - .kinds(&[LinkKind::Email]) - .links(v.to_cow().as_ref()) - .map(|email| Variable::String(email.as_str().to_string())) - .collect::>() - .into() - } - v => v, - } -} - -fn filter_url<'x, 'y>( - url: &'x str, - must_have_scheme: bool, - ctx: &'y Context<'y, SieveContext>, -) -> Option> { - if must_have_scheme || url.contains("://") { - Some(Variable::StringRef(url)) - } else { - // Filter out possible URLs without a valid TLD - let host = url.split_once('/').map_or(url, |(f, _)| f); - if (host - .as_bytes() - .first() - .map_or(true, |ch| ch.is_ascii_hexdigit()) - && host.parse::().is_ok()) - || ctx - .context() - .psl - .contains(host.rsplit_once('.').map_or(host, |(_, tld)| tld)) - || host.ends_with(".onion") - { - Some(Variable::String(format!("https://{url}"))) - } else { - None - } - } -} - pub fn fn_uri_part<'x>(_: &'x Context<'x, SieveContext>, v: Vec>) -> Variable<'x> { let part = v[1].to_cow(); v[0].transform(|uri| { diff --git a/crates/smtp/src/scripts/plugins/bayes.rs b/crates/smtp/src/scripts/plugins/bayes.rs new file mode 100644 index 00000000..e14f6b37 --- /dev/null +++ b/crates/smtp/src/scripts/plugins/bayes.rs @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2023 Stalwart Labs Ltd. + * + * This file is part of Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use directory::{DatabaseColumn, Lookup}; +use nlp::{ + bayes::{cache::BayesTokenCache, tokenize::BayesTokenizer, BayesModel, TokenHash, Weights}, + tokenizers::osb::{OsbToken, OsbTokenizer}, +}; +use sieve::{runtime::Variable, FunctionMap}; +use tokio::runtime::Handle; + +use crate::config::scripts::SieveContext; + +use super::PluginContext; + +pub fn register_train(plugin_id: u32, fnc_map: &mut FunctionMap) { + fnc_map.set_external_function("bayes_train", plugin_id, 2); +} + +pub fn register_untrain(plugin_id: u32, fnc_map: &mut FunctionMap) { + fnc_map.set_external_function("bayes_untrain", plugin_id, 2); +} + +pub fn register_classify(plugin_id: u32, fnc_map: &mut FunctionMap) { + fnc_map.set_external_function("bayes_classify", plugin_id, 1); +} + +pub fn exec_train(ctx: PluginContext<'_>) -> Variable<'static> { + train(ctx, true) +} + +pub fn exec_untrain(ctx: PluginContext<'_>) -> Variable<'static> { + train(ctx, false) +} + +fn train(ctx: PluginContext<'_>, is_train: bool) -> Variable<'static> { + let mut arguments = ctx.arguments.into_iter(); + let text = arguments.next().unwrap().into_string(); + if text.is_empty() { + return false.into(); + } + let handle = ctx.handle; + let ctx = ctx.core.sieve.runtime.context(); + + // Train the model + let is_spam = arguments.next().unwrap().to_bool(); + let mut model = BayesModel::default(); + model.train( + OsbTokenizer::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5), + is_spam, + ); + if model.weights.is_empty() { + return false.into(); + } + + // Update weight and invalidate cache + let upsert = &ctx.lookup_train; + for (hash, weights) in model.weights { + let (s_weight, h_weight) = if is_train { + (weights.spam as i64, weights.ham as i64) + } else { + (-(weights.spam as i64), -(weights.ham as i64)) + }; + if handle + .block_on(upsert.lookup(&[ + hash.h1.into(), + hash.h2.into(), + s_weight.into(), + h_weight.into(), + ])) + .is_none() + { + return false.into(); + } + ctx.bayes_cache.invalidate(&hash); + } + + // Update training counts + let train_val = if is_train { 1i64 } else { -1i64 }; + let (spam_count, ham_count) = if is_spam { + (train_val, 0i64) + } else { + (0i64, train_val) + }; + if handle + .block_on(upsert.query(&[ + 0i64.into(), + 0i64.into(), + spam_count.into(), + ham_count.into(), + ])) + .is_none() + { + return false.into(); + } + ctx.bayes_cache.invalidate(&TokenHash::default()); + + true.into() +} + +pub fn exec_classify(ctx: PluginContext<'_>) -> Variable<'static> { + let mut arguments = ctx.arguments.into_iter(); + let text = arguments.next().unwrap().into_string(); + if text.is_empty() { + return 0.into(); + } + let handle = ctx.handle; + let ctx = ctx.core.sieve.runtime.context(); + let get_token = &ctx.lookup_classify; + + // Obtain training counts + let (spam_learns, ham_learns) = if let Some(weights) = + ctx.bayes_cache + .get_or_update(TokenHash::default(), handle, get_token) + { + (weights.spam, weights.ham) + } else { + return 0.into(); + }; + + // Make sure we have enough training data + if spam_learns < ctx.bayes_classify.min_learns || ham_learns < ctx.bayes_classify.min_learns { + return 0.into(); + } + + // Classify the text + ctx.bayes_classify + .classify( + OsbTokenizer::<_, TokenHash>::new(BayesTokenizer::new(text.as_ref(), &ctx.psl), 5) + .filter_map(|t| { + OsbToken { + inner: ctx.bayes_cache.get_or_update(t.inner, handle, get_token)?, + idx: t.idx, + } + .into() + }), + ham_learns, + spam_learns, + ) + .unwrap_or_default() + .into() +} + +trait LookupOrInsert { + fn get_or_update( + &self, + hash: TokenHash, + handle: &Handle, + get_token: &Lookup, + ) -> Option; +} + +impl LookupOrInsert for BayesTokenCache { + fn get_or_update( + &self, + hash: TokenHash, + handle: &Handle, + get_token: &Lookup, + ) -> Option { + if let Some(weights) = self.get(&hash) { + weights.unwrap_or_default().into() + } else if let Some(result) = + handle.block_on(get_token.query(&[hash.h1.into(), hash.h2.into()])) + { + let mut result = result.into_iter(); + match (result.next(), result.next()) { + (Some(DatabaseColumn::Integer(spam)), Some(DatabaseColumn::Integer(ham))) => { + let weights = Weights { + spam: spam as u32, + ham: ham as u32, + }; + self.insert_positive(hash, weights); + weights + } + _ => { + self.insert_negative(hash); + Weights::default() + } + } + .into() + } else { + // Something went wrong + None + } + } +} diff --git a/crates/smtp/src/scripts/plugins/lookup.rs b/crates/smtp/src/scripts/plugins/lookup.rs index 3a677349..706cbaed 100644 --- a/crates/smtp/src/scripts/plugins/lookup.rs +++ b/crates/smtp/src/scripts/plugins/lookup.rs @@ -21,6 +21,7 @@ * for more details. */ +use directory::DatabaseColumn; use sieve::{runtime::Variable, FunctionMap}; use crate::config::scripts::SieveContext; @@ -62,15 +63,20 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> { } pub fn exec_map(ctx: PluginContext<'_>) -> Variable<'static> { - let lookup_id = ctx.arguments[0].to_cow(); - let item = ctx.arguments[1].to_cow(); + let mut arguments = ctx.arguments.into_iter(); + let lookup_id = arguments.next().unwrap().into_cow(); + let items = match arguments.next().unwrap() { + Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(), + Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(), + v => vec![DatabaseColumn::from(v)], + }; let span = ctx.span; - if !lookup_id.is_empty() && !item.is_empty() { + if !lookup_id.is_empty() && !items.is_empty() { if let Some(lookup) = ctx.core.sieve.lookup.get(lookup_id.as_ref()) { return ctx .handle - .block_on(lookup.lookup(item.as_ref())) + .block_on(lookup.lookup(&items)) .unwrap_or_default(); } else { tracing::warn!( diff --git a/crates/smtp/src/scripts/plugins/mod.rs b/crates/smtp/src/scripts/plugins/mod.rs index e5b99452..5909654b 100644 --- a/crates/smtp/src/scripts/plugins/mod.rs +++ b/crates/smtp/src/scripts/plugins/mod.rs @@ -21,13 +21,13 @@ * for more details. */ +pub mod bayes; pub mod dns; pub mod exec; pub mod http; pub mod lookup; pub mod query; -use ahash::AHashMap; use mail_parser::Message; use sieve::{runtime::Variable, FunctionMap, Input}; use tokio::runtime::Handle; @@ -41,12 +41,11 @@ pub struct PluginContext<'x> { pub span: &'x tracing::Span, pub handle: &'x Handle, pub core: &'x SMTP, - pub data: &'x mut AHashMap, pub message: &'x Message<'x>, pub arguments: Vec>, } -const PLUGINS_EXEC: [ExecPluginFnc; 7] = [ +const PLUGINS_EXEC: [ExecPluginFnc; 10] = [ query::exec, exec::exec, lookup::exec, @@ -54,8 +53,11 @@ const PLUGINS_EXEC: [ExecPluginFnc; 7] = [ dns::exec, dns::exec_exists, http::exec_header, + bayes::exec_train, + bayes::exec_untrain, + bayes::exec_classify, ]; -const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [ +const PLUGINS_REGISTER: [RegisterPluginFnc; 10] = [ query::register, exec::register, lookup::register, @@ -63,6 +65,9 @@ const PLUGINS_REGISTER: [RegisterPluginFnc; 7] = [ dns::register, dns::register_exists, http::register_header, + bayes::register_train, + bayes::register_untrain, + bayes::register_classify, ]; pub trait RegisterSievePlugins { diff --git a/crates/smtp/src/scripts/plugins/query.rs b/crates/smtp/src/scripts/plugins/query.rs index 60f674a0..60ad0255 100644 --- a/crates/smtp/src/scripts/plugins/query.rs +++ b/crates/smtp/src/scripts/plugins/query.rs @@ -22,7 +22,7 @@ */ use crate::config::scripts::SieveContext; -use directory::QueryColumn; +use directory::DatabaseColumn; use sieve::{runtime::Variable, FunctionMap}; use super::PluginContext; @@ -62,8 +62,12 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> { return false.into(); } - // Obtain parameters - let parameters = arguments.next().unwrap().into_string_array(); + // Obtain arguments + let arguments = match arguments.next().unwrap() { + Variable::Array(l) => l.into_iter().map(DatabaseColumn::from).collect(), + Variable::ArrayRef(l) => l.iter().map(DatabaseColumn::from).collect(), + v => vec![DatabaseColumn::from(v)], + }; // Run query if query @@ -71,12 +75,9 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> { .get(..6) .map_or(false, |q| q.eq_ignore_ascii_case(b"SELECT")) { - if let Ok(mut query_columns) = ctx.handle.block_on(directory.query( - &query, - ¶meters.iter().map(String::as_str).collect::>(), - )) { + if let Ok(mut query_columns) = ctx.handle.block_on(directory.query(&query, &arguments)) { match query_columns.len() { - 1 if !matches!(query_columns.first(), Some(QueryColumn::Null)) => { + 1 if !matches!(query_columns.first(), Some(DatabaseColumn::Null)) => { query_columns.pop().map(Variable::from).unwrap() } 0 => Variable::default(), @@ -87,10 +88,7 @@ pub fn exec(ctx: PluginContext<'_>) -> Variable<'static> { } } else { ctx.handle - .block_on(directory.lookup( - &query, - ¶meters.iter().map(String::as_str).collect::>(), - )) + .block_on(directory.lookup(&query, &arguments)) .is_ok() .into() } diff --git a/crates/utils/src/config/utils.rs b/crates/utils/src/config/utils.rs index ab95862e..38fe31d3 100644 --- a/crates/utils/src/config/utils.rs +++ b/crates/utils/src/config/utils.rs @@ -298,6 +298,18 @@ impl ParseValue for u64 { } } +impl ParseValue for f64 { + fn parse_value(key: impl AsKey, value: &str) -> super::Result { + value.parse().map_err(|_| { + format!( + "Invalid floating point value {:?} for property {:?}.", + value, + key.as_key() + ) + }) + } +} + impl ParseValue for u16 { fn parse_value(key: impl AsKey, value: &str) -> super::Result { value.parse().map_err(|_| { diff --git a/crates/utils/src/lib.rs b/crates/utils/src/lib.rs index 413b0f76..7e3cf76a 100644 --- a/crates/utils/src/lib.rs +++ b/crates/utils/src/lib.rs @@ -30,6 +30,7 @@ pub mod config; pub mod ipc; pub mod listener; pub mod map; +pub mod suffixlist; use opentelemetry::{ sdk::{ diff --git a/crates/utils/src/suffixlist.rs b/crates/utils/src/suffixlist.rs new file mode 100644 index 00000000..59ebd6c7 --- /dev/null +++ b/crates/utils/src/suffixlist.rs @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023 Stalwart Labs Ltd. + * + * This file is part of Stalwart Mail Server. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * in the LICENSE file at the top-level directory of this distribution. + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + * You can be released from the requirements of the AGPLv3 license by + * purchasing a commercial license. Please contact licensing@stalw.art + * for more details. +*/ + +use ahash::AHashSet; + +#[derive(Debug, Clone, Default)] +pub struct PublicSuffix { + pub suffixes: AHashSet, + pub exceptions: AHashSet, + pub wildcards: Vec, +} + +impl PublicSuffix { + pub fn contains(&self, suffix: &str) -> bool { + self.suffixes.contains(suffix) + || (!self.exceptions.contains(suffix) + && self.wildcards.iter().any(|w| suffix.ends_with(w))) + } +} + +impl From<&str> for PublicSuffix { + fn from(list: &str) -> Self { + let mut ps = PublicSuffix::default(); + for line in list.lines() { + let line = line.trim().to_lowercase(); + if !line.starts_with("//") { + if let Some(domain) = line.strip_prefix('*') { + ps.wildcards.push(domain.to_string()); + } else if let Some(domain) = line.strip_prefix('!') { + ps.exceptions.insert(domain.to_string()); + } else { + ps.suffixes.insert(line.to_string()); + } + } + } + ps.suffixes.insert("onion".to_string()); + ps + } +} diff --git a/resources/config/sieve/headers.sieve b/resources/config/sieve/headers.sieve index 74ed94b6..5ce51161 100644 --- a/resources/config/sieve/headers.sieve +++ b/resources/config/sieve/headers.sieve @@ -1,6 +1,5 @@ # Mailing list scores let "ml_score" "count(header.List-Id:List-Archive:List-Owner:List-Help:List-Post:X-Loop:List-Subscribe:List-Unsubscribe[*].exists) * 0.125"; -eval "print('ml_score: ' + ml_score)"; if eval "ml_score < 1" { if eval "header.List-Id.exists" { let "ml_score" "ml_score + 0.50";