From 3d9efd363a3ed1fab306e3f5e9fea41bfe58d8be Mon Sep 17 00:00:00 2001 From: mdecimus Date: Tue, 10 Oct 2023 18:58:38 +0200 Subject: [PATCH] Bayes classifier, type tokenizer and NLP module reorganization --- CHANGELOG.md | 3 +- Cargo.lock | 116 +- Cargo.toml | 2 +- README.md | 1 + crates/antispam/Cargo.toml | 7 - crates/antispam/src/main.rs | 64 - .../src/modules/antispam}/mod.rs | 0 .../src/modules/antispam}/spamassassin.rs | 0 .../src/modules/antispam}/tokenizer.rs | 0 .../src/modules/antispam}/utils.rs | 0 crates/imap/Cargo.toml | 1 + crates/imap/src/op/search.rs | 3 +- crates/jmap/Cargo.toml | 1 + crates/jmap/src/api/config.rs | 6 +- crates/jmap/src/email/index.rs | 6 +- crates/jmap/src/email/query.rs | 3 +- crates/jmap/src/email/snippet.rs | 7 +- crates/jmap/src/lib.rs | 2 +- crates/jmap/src/mailbox/query.rs | 2 +- crates/jmap/src/sieve/query.rs | 6 +- crates/nlp/Cargo.toml | 19 + crates/nlp/src/bayes/bloom.rs | 77 + crates/nlp/src/bayes/classify.rs | 167 + crates/nlp/src/bayes/mod.rs | 75 + crates/nlp/src/bayes/train.rs | 68 + .../lang.rs => nlp/src/language/detect.rs} | 0 crates/nlp/src/language/mod.rs | 202 ++ .../src/fts => nlp/src/language}/stemmer.rs | 16 +- crates/nlp/src/lib.rs | 78 + crates/nlp/src/tokenizers/chinese.rs | 222 ++ crates/nlp/src/tokenizers/japanese.rs | 179 + crates/nlp/src/tokenizers/mod.rs | 74 + .../src/fts => nlp/src}/tokenizers/space.rs | 0 crates/nlp/src/tokenizers/types.rs | 2878 +++++++++++++++++ .../src/tokenizers/word.rs} | 16 +- .../ngram.rs => nlp/src/transformers/mod.rs} | 39 +- crates/nlp/src/transformers/osb.rs | 467 +++ crates/store/Cargo.toml | 7 +- crates/store/src/fts/bloom.rs | 7 +- crates/store/src/fts/builder.rs | 20 +- crates/store/src/fts/mod.rs | 154 +- crates/store/src/fts/query.rs | 8 +- crates/store/src/fts/search_snippet.rs | 10 +- crates/store/src/fts/term_index.rs | 22 +- crates/store/src/fts/tokenizers/chinese.rs | 197 -- crates/store/src/fts/tokenizers/japanese.rs | 168 - crates/store/src/fts/tokenizers/mod.rs | 96 - crates/store/src/fts/tokenizers/word.rs | 80 - crates/store/src/query/filter.rs | 6 +- crates/store/src/query/mod.rs | 5 +- crates/store/src/write/mod.rs | 4 +- tests/Cargo.toml | 1 + tests/src/store/query.rs | 3 +- 53 files changed, 4651 insertions(+), 944 deletions(-) delete mode 100644 crates/antispam/Cargo.toml delete mode 100644 crates/antispam/src/main.rs rename crates/{antispam/src/import => cli/src/modules/antispam}/mod.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/spamassassin.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/tokenizer.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/utils.rs (100%) create mode 100644 crates/nlp/Cargo.toml create mode 100644 crates/nlp/src/bayes/bloom.rs create mode 100644 crates/nlp/src/bayes/classify.rs create mode 100644 crates/nlp/src/bayes/mod.rs create mode 100644 crates/nlp/src/bayes/train.rs rename crates/{store/src/fts/lang.rs => nlp/src/language/detect.rs} (100%) create mode 100644 crates/nlp/src/language/mod.rs rename crates/{store/src/fts => nlp/src/language}/stemmer.rs (93%) create mode 100644 crates/nlp/src/lib.rs create mode 100644 crates/nlp/src/tokenizers/chinese.rs create mode 100644 crates/nlp/src/tokenizers/japanese.rs create mode 100644 crates/nlp/src/tokenizers/mod.rs rename crates/{store/src/fts => nlp/src}/tokenizers/space.rs (100%) create mode 100644 crates/nlp/src/tokenizers/types.rs rename crates/{store/src/fts/tokenizers/indo_european.rs => nlp/src/tokenizers/word.rs} 
(94%) rename crates/{store/src/fts/ngram.rs => nlp/src/transformers/mod.rs} (53%) create mode 100644 crates/nlp/src/transformers/osb.rs delete mode 100644 crates/store/src/fts/tokenizers/chinese.rs delete mode 100644 crates/store/src/fts/tokenizers/japanese.rs delete mode 100644 crates/store/src/fts/tokenizers/mod.rs delete mode 100644 crates/store/src/fts/tokenizers/word.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 2453db6e..c20ad3de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. This projec ## [0.3.9] - 2023-10-07 ## Added -- Support for reading environment variables from configuration file using the `!ENV_VAR_NAME` special keyword. +- Support for reading environment variables from the configuration file using the `!ENV_VAR_NAME` special keyword. +- Option to disable ANSI color codes in logs. ### Changed - Querying directories from a Sieve script is now done using the `query()` method from `eval`. Your scripts will need to be updated, please refer to the [new syntax](https://stalw.art/docs/smtp/filter/sieve#directory-queries). diff --git a/Cargo.lock b/Cargo.lock index 4c57347e..f86c208d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,13 +169,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "antispam" -version = "0.1.0" -dependencies = [ - "fancy-regex", -] - [[package]] name = "anyhow" version = "1.0.75" @@ -1487,25 +1480,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" dependencies = [ - "errno-dragonfly", "libc", "windows-sys 0.48.0", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "etcetera" version = "0.8.0" @@ -2252,6 +2234,7 @@ dependencies = [ "mail-parser", "mail-send", "md5", + "nlp", "parking_lot", "rustls 0.21.7", "rustls-pemfile", @@ -2450,6 +2433,7 @@ dependencies = [ "mail-parser", "mail-send", "mime", + "nlp", "p256", "rand 0.8.5", "rasn", @@ -2510,9 +2494,9 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] @@ -2703,9 +2687,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lock_api" @@ -2754,7 +2738,7 @@ dependencies = [ "mail-parser", "parking_lot", "quick-xml 0.30.0", - "ring 0.17.2", + "ring 0.17.3", "rustls-pemfile", "serde", "serde_json", @@ -3001,6 +2985,30 @@ dependencies = [ "pin-utils", ] +[[package]] +name = "nlp" +version = "0.3.9" +dependencies = [ + "ahash 0.8.3", + "bincode", + "farmhash", + "jieba-rs", + "lazy_static", + "nohash", + "rust-stemmers", + 
"serde", + "siphasher 1.0.0", + "tinysegmenter", + "whatlang", + "xxhash-rust", +] + +[[package]] +name = "nohash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca" + [[package]] name = "nom" version = "7.1.3" @@ -3072,9 +3080,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", "libm", @@ -3476,7 +3484,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ - "siphasher", + "siphasher 0.3.11", ] [[package]] @@ -3485,7 +3493,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" dependencies = [ - "siphasher", + "siphasher 0.3.11", ] [[package]] @@ -3791,9 +3799,9 @@ dependencies = [ [[package]] name = "rasn" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf5174961dbfd4f03b57e71e5a11b034f564d5f0b133d63e39d703ac3d2876b" +checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2" dependencies = [ "arrayvec", "bitvec", @@ -3813,9 +3821,9 @@ dependencies = [ [[package]] name = "rasn-cms" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56517898cf38bb50fdb6479049ed476510bf59ae7d329b35129dc8a8b309697f" +checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5" dependencies = [ "rasn", "rasn-pkix", @@ -3823,9 +3831,9 @@ dependencies = [ [[package]] name = "rasn-derive" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8def4ce07f970be91bad36c3090af419dcd9e696897ada3cf74bd480e0101d61" +checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618" dependencies = [ "either", "itertools 0.10.5", @@ -3838,9 +3846,9 @@ dependencies = [ [[package]] name = "rasn-pkix" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebdeef45b70d4c20ce34725707b2784c761eacaaa4d841eab46f9f9c6dc10dd3" +checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8" dependencies = [ "rasn", ] @@ -4024,9 +4032,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.2" +version = "0.17.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "911b295d2d302948838c8ac142da1ee09fa7863163b44e6715bc9357905878b8" +checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e" dependencies = [ "cc", "getrandom 0.2.10", @@ -4198,9 +4206,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.17" +version = "0.38.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7" +checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c" dependencies = [ "bitflags 2.4.0", "errno", @@ -4644,6 +4652,12 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "siphasher" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" + [[package]] name = "slab" version = "0.4.9" @@ -5048,10 +5062,10 @@ dependencies = [ "farmhash", "foundationdb", "futures", - "jieba-rs", "lazy_static", "lru-cache", "maybe-async 0.2.7", + "nlp", "num_cpus", "parking_lot", "r2d2", @@ -5061,14 +5075,11 @@ dependencies = [ "rocksdb", "rusqlite", "rust-s3", - "rust-stemmers", "serde", - "siphasher", - "tinysegmenter", + "siphasher 1.0.0", "tokio", "tracing", "utils", - "whatlang", "xxhash-rust", ] @@ -5244,6 +5255,7 @@ dependencies = [ "mail-parser", "mail-send", "managesieve", + "nlp", "num_cpus", "rayon", "reqwest", @@ -5358,9 +5370,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.32.0" +version = "1.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" dependencies = [ "backtrace", "bytes", @@ -6040,12 +6052,12 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" +checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.3", + "untrusted 0.9.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b9dea6d6..36ca52e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,9 +8,9 @@ members = [ "crates/imap-proto", "crates/smtp", "crates/managesieve", + "crates/nlp", "crates/store", "crates/directory", - "crates/antispam", "crates/utils", "crates/maybe-async", "crates/cli", diff --git a/README.md b/README.md index 04bc9985..0574f2b9 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Key features: - OAuth 2.0 [authorization code](https://www.rfc-editor.org/rfc/rfc8628) and [device authorization](https://www.rfc-editor.org/rfc/rfc8628) flows. - Access Control Lists (ACLs). - Rate limiting. + - Security audited (read the [report](https://stalw.art/blog/security-audit)). - **Robust and scalable**: - **FoundationDB** or **SQLite** database backends. - **S3-compatible** blob storage support. 
diff --git a/crates/antispam/Cargo.toml b/crates/antispam/Cargo.toml deleted file mode 100644 index 9bad0181..00000000 --- a/crates/antispam/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "antispam" -version = "0.1.0" -edition = "2021" - -[dependencies] -fancy-regex = "0.11.0" diff --git a/crates/antispam/src/main.rs b/crates/antispam/src/main.rs deleted file mode 100644 index c83f7f01..00000000 --- a/crates/antispam/src/main.rs +++ /dev/null @@ -1,64 +0,0 @@ -use std::path::PathBuf; - -use import::spamassassin::import_spamassassin; - -pub mod import; - -fn main() { - import_spamassassin( - PathBuf::from("/Users/me/code/mail-server/resources/spamassassin"), - "cf".to_string(), - false, - ); -} - -const _IGNORE: &str = r#" - -[antispam] -required-score = 5 -add-headers = ["X-Spam-Checker-Version: SpamAssassin _VERSION_ (_SUBVERSION_) on _HOSTNAME_", - "X-Spam-Flag: _YESNOCAPS_", "X-Spam-Level: _STARS(*)_", - "X-Spam-Status: _YESNO_, score=_SCORE_ required=_REQD_ tests=_TESTS_ autolearn=_AUTOLEARN_ version=_VERSION_"] -originating-ip-headers = ["X-Yahoo-Post-IP", "X-Originating-IP", "X-Apparently-From", - "X-SenderIP X-AOL-IP", "X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp"] -rewrite-headers = ["Subject: [SPAM] _SUBJECT_"] -redirect-patterns = ["""m'/(?:index.php)?\?.*(?<=[?&])URL=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/url\?.*?(?<=[?&])q=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:site|inurl):(.*?)(?:$|%20|[\s+&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:"|%22)(.*?)(?:$|%22|["\s+&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/translate\?.*?(?<=[?&])u=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/pagead/iclk\?.*?(?<=[?&])adurl=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?aol\.com/redir\.adp\?.*(?<=[?&])_url=(.*?)(?:$|[&\#])'i""", - """m'^https?/*(?:\w+\.)?facebook\.com/l/;(.*)'i""", - """/^http:\/\/chkpt\.zdnet\.com\/chkpt\/\w+\/(.*)$/i""", - """/^http:\/\/www(?:\d+)?\.nate\.com\/r\/\w+\/(.*)$/i""", - """/^http:\/\/.+\.gov\/(?:.*\/)?externalLink\.jhtml\?.*url=(.*?)(?:&.*)?$/i""", - """/^http:\/\/redir\.internet\.com\/.+?\/.+?\/(.*)$/i""", - """/^http:\/\/(?:.*?\.)?adtech\.de\/.*(?:;|\|)link=(.*?)(?:;|$)/i""", - """m'^http.*?/redirect\.php\?.*(?<=[?&])goto=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:[^/]+\.)?emf\d\.com/r\.cfm.*?&r=(.*)'i""" -] - -[antispam.autolearn] -enable = true -ignore-headers = [ "X-ACL-Warn", "X-Alimail-AntiSpam", "X-Amavis-Modified", "X-Anti*", "X-aol-global-disposition", - "X-ASF-*", "X-Assp-Version", "X-Authority-Analysis", "X-Authvirus", "X-Auto-Response-Suppress", "X-AV-Do-Run", - "X-AV-Status", "X-avast-antispam", "X-Backend", "X-Barracuda*", "X-Bayes*", "X-BitDefender*", "X-BL", "X-Bogosity", - "X-Boxtrapper", "X-Brightmail-Tracker", "X-BTI-AntiSpam", "X-Bugzilla-Version", "X-CanIt*", "X-Clapf-spamicity", - "X-Cloud-Security", "X-CM-Score", "X-CMAE-*", "X-Company", "X-Coremail-Antispam", "X-CRM114-*", "X-CT-Spam", - "X-CTCH-*", "X-Drweb-SpamState", "X-DSPAM*", "X-eavas*", "X-Enigmail-Version", "X-Eset*", "X-Exchange-Antispam-Report", - "X-ExtloopSabreCommercials1", "X-EYOU-SPAMVALUE", "X-FB-OUTBOUND-SPAM", "X-FEAS-SBL", "X-FILTER-SCORE", "X-Forefront*", - "X-Fuglu*", "X-getmail-filter-classifier", "X-GFIME-MASPAM", "X-Gmane-NNTP-Posting-Host", "X-GMX-Anti*", "X-He-Spam", - "X-hMailServer-Spam", "X-IAS", 
"X-iGspam-global", "X-Injected-Via-Gmane", "X-Interia-Antivirus", "X-IP-Spam-Verdict", - "X-Ironport*", "X-Junk*", "X-KLMS-*", "X-KMail-*", "X-MailCleaner-*", "X-MailFoundry", "X-MDMailLookup-Result", - "X-ME-*", "X-MessageFilter", "X-Microsoft-Antispam", "X-Mlf-Version", "X-MXScan-*", "X-NAI-Spam-*", "X-NetStation-Status", - "X-OVH-SPAM*", "X-PerlMx-*", "X-PFSI-Info", "X-PMX-*", "X-Policy-Service", "X-policyd-weight", "X-PreRBLs", - "X-Probable-Spam", "X-PROLinux-SpamCheck", "X-Proofpoint-*", "x-purgate-*", "X-Qmail-Scanner-*", "X-Quarantine-ID", - "X-RSpam-Report", "X-SA-*", "X-Scanned-by", "X-SmarterMail-CustomSpamHeader", "X-Spam*", "X-SPF-Scan-By", "X-STA-*", - "X-StarScan-Version", "X-SurGATE-Result", "X-SWITCHham-Score", "X-UI-*", "X-Univie*", "X-Virus*", "X-VR-*", - "X-WatchGuard*", "X-Whitelist-Domain", "X-WUM-CCI", "X_CMAE_Category" ] -threshold.ham = 0.1 -threshold.spam = 12.0 - - -"#; diff --git a/crates/antispam/src/import/mod.rs b/crates/cli/src/modules/antispam/mod.rs similarity index 100% rename from crates/antispam/src/import/mod.rs rename to crates/cli/src/modules/antispam/mod.rs diff --git a/crates/antispam/src/import/spamassassin.rs b/crates/cli/src/modules/antispam/spamassassin.rs similarity index 100% rename from crates/antispam/src/import/spamassassin.rs rename to crates/cli/src/modules/antispam/spamassassin.rs diff --git a/crates/antispam/src/import/tokenizer.rs b/crates/cli/src/modules/antispam/tokenizer.rs similarity index 100% rename from crates/antispam/src/import/tokenizer.rs rename to crates/cli/src/modules/antispam/tokenizer.rs diff --git a/crates/antispam/src/import/utils.rs b/crates/cli/src/modules/antispam/utils.rs similarity index 100% rename from crates/antispam/src/import/utils.rs rename to crates/cli/src/modules/antispam/utils.rs diff --git a/crates/imap/Cargo.toml b/crates/imap/Cargo.toml index 9ce4f25e..3e1a97f9 100644 --- a/crates/imap/Cargo.toml +++ b/crates/imap/Cargo.toml @@ -10,6 +10,7 @@ jmap = { path = "../jmap" } jmap_proto = { path = "../jmap-proto" } directory = { path = "../directory" } store = { path = "../store" } +nlp = { path = "../nlp" } utils = { path = "../utils" } mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "ludicrous_mode"] } mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] } diff --git a/crates/imap/src/op/search.rs b/crates/imap/src/op/search.rs index fa67d208..32ff976d 100644 --- a/crates/imap/src/op/search.rs +++ b/crates/imap/src/op/search.rs @@ -34,8 +34,9 @@ use imap_proto::{ use jmap_proto::types::{collection::Collection, id::Id, keyword::Keyword, property::Property}; use mail_parser::HeaderName; +use nlp::language::Language; use store::{ - fts::{builder::MAX_TOKEN_LENGTH, Language}, + fts::builder::MAX_TOKEN_LENGTH, query::{self, log::Query, sort::Pagination, ResultSet}, roaring::RoaringBitmap, write::now, diff --git a/crates/jmap/Cargo.toml b/crates/jmap/Cargo.toml index 1ea05664..fa1de123 100644 --- a/crates/jmap/Cargo.toml +++ b/crates/jmap/Cargo.toml @@ -6,6 +6,7 @@ resolver = "2" [dependencies] store = { path = "../store" } +nlp = { path = "../nlp" } jmap_proto = { path = "../jmap-proto" } smtp = { path = "../smtp" } utils = { path = "../utils" } diff --git a/crates/jmap/src/api/config.rs b/crates/jmap/src/api/config.rs index abed3120..d784a205 100644 --- a/crates/jmap/src/api/config.rs +++ b/crates/jmap/src/api/config.rs @@ -23,10 +23,8 @@ use std::{str::FromStr, time::Duration}; -use 
store::{
-    fts::Language,
-    rand::{distributions::Alphanumeric, thread_rng, Rng},
-};
+use nlp::language::Language;
+use store::rand::{distributions::Alphanumeric, thread_rng, Rng};
 
 use super::session::BaseCapabilities;
diff --git a/crates/jmap/src/email/index.rs b/crates/jmap/src/email/index.rs
index f5170f1d..fcb5f826 100644
--- a/crates/jmap/src/email/index.rs
+++ b/crates/jmap/src/email/index.rs
@@ -37,11 +37,9 @@ use mail_parser::{
     parsers::{fields::thread::thread_name, preview::preview_text},
     Addr, Address, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType,
 };
+use nlp::language::Language;
 use store::{
-    fts::{
-        builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
-        Language,
-    },
+    fts::builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
     write::{BatchBuilder, IntoOperations, F_BITMAP, F_CLEAR, F_INDEX, F_VALUE},
 };
diff --git a/crates/jmap/src/email/query.rs b/crates/jmap/src/email/query.rs
index ac3b57e4..96d74eb5 100644
--- a/crates/jmap/src/email/query.rs
+++ b/crates/jmap/src/email/query.rs
@@ -28,8 +28,9 @@ use jmap_proto::{
     types::{acl::Acl, collection::Collection, keyword::Keyword, property::Property},
 };
 use mail_parser::HeaderName;
+use nlp::language::Language;
 use store::{
-    fts::{builder::MAX_TOKEN_LENGTH, Language},
+    fts::builder::MAX_TOKEN_LENGTH,
     query::{self},
     roaring::RoaringBitmap,
     ValueKey,
diff --git a/crates/jmap/src/email/snippet.rs b/crates/jmap/src/email/snippet.rs
index 0931b82c..6beb2e7c 100644
--- a/crates/jmap/src/email/snippet.rs
+++ b/crates/jmap/src/email/snippet.rs
@@ -30,14 +30,12 @@ use jmap_proto::{
     types::{acl::Acl, collection::Collection},
 };
 use mail_parser::{decoders::html::html_to_text, MessageParser, PartType};
+use nlp::language::{stemmer::Stemmer, Language};
 use store::{
     fts::{
         builder::MAX_TOKEN_LENGTH,
         search_snippet::generate_snippet,
-        stemmer::Stemmer,
         term_index::{self, TermIndex},
-        tokenizers::Tokenizer,
-        Language,
     },
     BlobKind,
 };
@@ -66,7 +64,8 @@ impl JMAP {
                 || (text.starts_with('\'') && text.ends_with('\''))
             {
                 terms.push(
-                    Tokenizer::new(&text, language, MAX_TOKEN_LENGTH)
+                    language
+                        .tokenize_text(&text, MAX_TOKEN_LENGTH)
                         .map(|token| (token.word.into_owned(), None))
                         .collect::<Vec<_>>(),
                 );
diff --git a/crates/jmap/src/lib.rs b/crates/jmap/src/lib.rs
index ae6c3176..6ca93c86 100644
--- a/crates/jmap/src/lib.rs
+++ b/crates/jmap/src/lib.rs
@@ -40,6 +40,7 @@ use jmap_proto::{
     },
     types::{collection::Collection, property::Property},
 };
+use nlp::language::Language;
 use services::{
     delivery::spawn_delivery_manager,
     housekeeper::{self, init_housekeeper, spawn_housekeeper},
 };
 use smtp::core::SMTP;
 use store::{
-    fts::Language,
     parking_lot::Mutex,
     query::{sort::Pagination, Comparator, Filter, ResultSet, SortedResultSet},
     roaring::RoaringBitmap,
diff --git a/crates/jmap/src/mailbox/query.rs b/crates/jmap/src/mailbox/query.rs
index 84880059..9f1b81d2 100644
--- a/crates/jmap/src/mailbox/query.rs
+++ b/crates/jmap/src/mailbox/query.rs
@@ -27,9 +27,9 @@ use jmap_proto::{
     object::{mailbox::QueryArguments, Object},
     types::{acl::Acl, collection::Collection, property::Property, value::Value},
 };
+use nlp::language::Language;
 use store::{
     ahash::{AHashMap, AHashSet},
-    fts::Language,
     query::{self, sort::Pagination},
     roaring::RoaringBitmap,
 };
diff --git a/crates/jmap/src/sieve/query.rs b/crates/jmap/src/sieve/query.rs
index 033f0054..7f570160 100644
--- a/crates/jmap/src/sieve/query.rs
+++ b/crates/jmap/src/sieve/query.rs
@@ -28,10 +28,8 @@ use jmap_proto::{
     },
     types::{collection::Collection, property::Property},
 };
-use store::{
-    fts::Language,
-    query::{self},
-};
+use nlp::language::Language;
+use store::query::{self};
 
 use crate::JMAP;
diff --git a/crates/nlp/Cargo.toml b/crates/nlp/Cargo.toml
new file mode 100644
index 00000000..9db50841
--- /dev/null
+++ b/crates/nlp/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "nlp"
+version = "0.3.9"
+edition = "2021"
+resolver = "2"
+
+[dependencies]
+xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
+farmhash = "1.1.5"
+siphasher = "1.0"
+serde = { version = "1.0", features = ["derive"]}
+bincode = "1.3.3"
+nohash = "0.2.0"
+ahash = "0.8.3"
+lazy_static = "1.4"
+whatlang = "0.16" # Language detection
+rust-stemmers = "1.2" # Stemmers
+tinysegmenter = "0.1" # Japanese tokenizer
+jieba-rs = "0.6" # Chinese tokenizer (word segmentation)
diff --git a/crates/nlp/src/bayes/bloom.rs b/crates/nlp/src/bayes/bloom.rs
new file mode 100644
index 00000000..e701bcd6
--- /dev/null
+++ b/crates/nlp/src/bayes/bloom.rs
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use nohash::IsEnabled;
+
+use crate::transformers::osb::{Gram, OsbToken};
+
+use super::TokenHash;
+
+pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
+    buf: Vec<u8>,
+    tokens: T,
+}
+
+impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
+    pub fn new(tokens: T) -> Self {
+        Self {
+            buf: Vec::with_capacity(64),
+            tokens,
+        }
+    }
+}
+
+impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
+    type Item = OsbToken<TokenHash>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.tokens.next().map(|token| {
+            let bytes = match token.inner {
+                Gram::Uni { t1 } => t1.as_bytes(),
+                Gram::Bi { t1, t2, .. } => {
+                    self.buf.clear();
+                    self.buf.extend_from_slice(t1.as_bytes());
+                    self.buf.push(b' ');
+                    self.buf.extend_from_slice(t2.as_bytes());
+                    &self.buf
+                }
+            };
+
+            OsbToken {
+                inner: TokenHash {
+                    h1: xxhash_rust::xxh3::xxh3_64(bytes),
+                    h2: farmhash::hash64(bytes),
+                },
+                idx: token.idx,
+            }
+        })
+    }
+}
+
+impl std::hash::Hash for TokenHash {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        state.write_u64(self.h1 ^ self.h2);
+    }
+}
+
+impl IsEnabled for TokenHash {}
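For reference, the token-to-hash path in miniature (a sketch of the scheme above; xxh3_64 and farmhash::hash64 are the same calls BloomHasher makes, and the space-joined bigram layout matches its buffer handling):

// Sketch: how the bigram ("free", "offer") becomes a TokenHash.
fn hash_bigram(t1: &str, t2: &str) -> (u64, u64) {
    let mut buf = Vec::with_capacity(64);
    buf.extend_from_slice(t1.as_bytes());
    buf.push(b' ');
    buf.extend_from_slice(t2.as_bytes());
    // Two independent 64-bit hashes are kept side by side; the Hash impl
    // above folds them with XOR so the nohash-backed map can use them
    // directly as a pre-hashed key.
    (xxhash_rust::xxh3::xxh3_64(&buf), farmhash::hash64(&buf))
}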
diff --git a/crates/nlp/src/bayes/classify.rs b/crates/nlp/src/bayes/classify.rs
new file mode 100644
index 00000000..38f5da85
--- /dev/null
+++ b/crates/nlp/src/bayes/classify.rs
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use crate::transformers::osb::OsbToken;
+
+use super::{BayesClassifier, Weights};
+
+// Position 0 represents Unigram weights
+const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
+
+impl BayesClassifier {
+    pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
+    where
+        T: Iterator<Item = OsbToken<Weights>>,
+    {
+        if self.min_learns > 0 && (spam_learns < self.min_learns || ham_learns < self.min_learns) {
+            return None;
+        }
+
+        let mut processed_tokens = 0;
+        let mut total_spam_prob = 0.0;
+        let mut total_ham_prob = 0.0;
+
+        for token in tokens {
+            let weights = token.inner;
+            let total_count = weights.spam + weights.ham;
+
+            if total_count >= self.min_token_hits {
+                let total_count = total_count as f64;
+                let spam_freq = weights.spam as f64 / f64::max(1.0, spam_learns as f64);
+                let ham_freq = weights.ham as f64 / f64::max(1.0, ham_learns as f64);
+                let spam_prob = spam_freq / (spam_freq + ham_freq);
+                let ham_prob = ham_freq / (spam_freq + ham_freq);
+
+                let fw = FEATURE_WEIGHT[token.idx];
+                let w = (fw * total_count) / (1.0 + fw * total_count);
+                let bayes_spam_prob = prob_combine(spam_prob, total_count, w, 0.5);
+
+                if !((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + self.min_prob_strength)
+                    || (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - self.min_prob_strength))
+                {
+                    let bayes_ham_prob = prob_combine(ham_prob, total_count, w, 0.5);
+                    total_spam_prob += bayes_spam_prob.ln();
+                    total_ham_prob += bayes_ham_prob.ln();
+                    processed_tokens += 1;
+                }
+            }
+        }
+
+        if processed_tokens == 0
+            || self.min_tokens > 0 && processed_tokens < (self.min_tokens as f64 * 0.1) as u32
+        {
+            return None;
+        }
+
+        let (h, s) = if total_spam_prob > -300.0 && total_ham_prob > -300.0 {
+            /* Fisher value is low enough to apply inv_chi_square */
+            (
+                1.0 - inv_chi_square(total_spam_prob, processed_tokens),
+                1.0 - inv_chi_square(total_ham_prob, processed_tokens),
+            )
+        } else {
+            /* Use naive method */
+            if total_spam_prob < total_ham_prob {
+                let h = (1.0 - (total_spam_prob - total_ham_prob).exp())
+                    / (1.0 + (total_spam_prob - total_ham_prob).exp());
+                (h, 1.0 - h)
+            } else {
+                let s = (1.0 - (total_ham_prob - total_spam_prob).exp())
+                    / (1.0 + (total_ham_prob - total_spam_prob).exp());
+                (1.0 - s, s)
+            }
+        };
+
+        let final_prob = if h.is_finite() && s.is_finite() {
+            (s + 1.0 - h) / 2.0
+        } else {
+            /*
+             * We have some overflow, hence we need to check which class
+             * is NaN
+             */
+
+            if h.is_finite() {
+                1.0
+            } else if s.is_finite() {
+                0.0
+            } else {
+                0.5
+            }
+        };
+
+        if processed_tokens > 0 && (final_prob - 0.5).abs() > 0.05 {
+            Some(final_prob)
+        } else {
+            None
+        }
+    }
+}
+
+/**
+ * Returns the probability of chi-square > value for the specified number of
+ * degrees of freedom
+ */
+#[inline(always)]
+fn inv_chi_square(value: f64, freedom_deg: u32) -> f64 {
+    let mut prob = value.exp();
+
+    if prob.is_finite() {
+        /*
+         * m is our confidence in class
+         * prob is e ^ x (a small value, since x is normally less than zero).
+         * So we integrate over degrees of freedom and produce the total result
+         * from 1.0 (no confidence) to 0.0 (full confidence)
+         */
+
+        let mut sum = prob;
+        let m = -value;
+
+        for i in 1..freedom_deg {
+            prob *= m / i as f64;
+            sum += prob;
+        }
+
+        f64::min(1.0, sum)
+    } else {
+        /*
+         * e^x where x is a large *NEGATIVE* number is OK, so we have a very strong
+         * confidence that inv-chi-square is close to zero
+         */
+
+        if value < 0.0 {
+            0.0
+        } else {
+            1.0
+        }
+    }
+}
+
+/*#[inline(always)]
+fn normalize_probability(x: f64, bias: f64) -> f64 {
+    ((x - bias) * 2.0).powi(8)
+}*/
+
+#[inline(always)]
+fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
+    ((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))
+}
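A worked instance of prob_combine: for a unigram (feature weight fw = 1.0) seen total_count = 3 times with raw spam probability 0.9, the confidence weight is w = (1.0 * 3) / (1.0 + 1.0 * 3) = 0.75, so prob_combine(0.9, 3.0, 0.75, 0.5) = (0.75 * 0.5 + 3.0 * 0.9) / (0.75 + 3.0) = 0.82. Rarely-seen tokens are shrunk toward the 0.5 prior, while frequently-seen ones keep close to their observed probability.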
diff --git a/crates/nlp/src/bayes/mod.rs b/crates/nlp/src/bayes/mod.rs
new file mode 100644
index 00000000..3fb419d2
--- /dev/null
+++ b/crates/nlp/src/bayes/mod.rs
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::{collections::HashMap, hash::BuildHasherDefault};
+
+use nohash::NoHashHasher;
+use serde::{Deserialize, Serialize};
+
+pub mod bloom;
+pub mod classify;
+pub mod train;
+
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub struct BayesModel {
+    pub weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>>,
+    pub spam_learns: u32,
+    pub ham_learns: u32,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct BayesClassifier {
+    pub min_token_hits: u32,
+    pub min_tokens: u32,
+    pub min_prob_strength: f64,
+    pub min_learns: u32,
+}
+
+#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
+pub struct TokenHash {
+    h1: u64,
+    h2: u64,
+}
+
+#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
+pub struct Weights {
+    spam: u32,
+    ham: u32,
+}
+
+impl BayesClassifier {
+    pub fn new() -> Self {
+        BayesClassifier {
+            min_token_hits: 2,
+            min_tokens: 11,
+            min_prob_strength: 0.05,
+            min_learns: 200,
+        }
+    }
+}
+
+impl Default for BayesClassifier {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/crates/nlp/src/bayes/train.rs b/crates/nlp/src/bayes/train.rs
new file mode 100644
index 00000000..7ba0881d
--- /dev/null
+++ b/crates/nlp/src/bayes/train.rs
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use crate::transformers::osb::OsbToken;
+
+use super::{BayesModel, TokenHash};
+
+impl BayesModel {
+    pub fn train<T>(&mut self, tokens: T, is_spam: bool)
+    where
+        T: IntoIterator<Item = OsbToken<TokenHash>>,
+    {
+        if is_spam {
+            self.spam_learns += 1;
+        } else {
+            self.ham_learns += 1;
+        }
+
+        for token in tokens {
+            let hs = self.weights.entry(token.inner).or_default();
+            if is_spam {
+                hs.spam += 1;
+            } else {
+                hs.ham += 1;
+            }
+        }
+    }
+
+    pub fn untrain<T>(&mut self, tokens: T, is_spam: bool)
+    where
+        T: IntoIterator<Item = OsbToken<TokenHash>>,
+    {
+        if is_spam {
+            self.spam_learns -= 1;
+        } else {
+            self.ham_learns -= 1;
+        }
+
+        for token in tokens {
+            let hs = self.weights.entry(token.inner).or_default();
+            if is_spam {
+                hs.spam -= 1;
+            } else {
+                hs.ham -= 1;
+            }
+        }
+    }
+}
diff --git a/crates/store/src/fts/lang.rs b/crates/nlp/src/language/detect.rs
similarity index 100%
rename from crates/store/src/fts/lang.rs
rename to crates/nlp/src/language/detect.rs
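Putting the two halves together, a minimal sketch of the intended train/classify flow, mirroring the test module in crates/nlp/src/lib.rs further down in this patch (corpus strings are illustrative):

use nlp::bayes::{bloom::BloomHasher, BayesClassifier, BayesModel};
use nlp::transformers::osb::{OsbToken, OsbTokenizer};

fn sketch() {
    let mut model = BayesModel::default();
    // One message per class; a real model needs min_learns (200) of each.
    model.train(
        BloomHasher::new(OsbTokenizer::new("buy cheap pills now".split_ascii_whitespace(), 5)),
        true,
    );
    model.train(
        BloomHasher::new(OsbTokenizer::new("meeting notes attached".split_ascii_whitespace(), 5)),
        false,
    );

    // Classification maps each hashed token to its stored Weights first.
    let classifier = BayesClassifier::new();
    let verdict = classifier.classify(
        BloomHasher::new(OsbTokenizer::new("cheap pills".split_ascii_whitespace(), 5))
            .filter_map(|t| model.weights.get(&t.inner).map(|w| OsbToken { inner: *w, idx: t.idx })),
        model.ham_learns,
        model.spam_learns,
    );
    // None here: with only one learn per class, the min_learns guard declines to score.
    assert!(verdict.is_none());
}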
diff --git a/crates/nlp/src/language/mod.rs b/crates/nlp/src/language/mod.rs
new file mode 100644
index 00000000..edc87368
--- /dev/null
+++ b/crates/nlp/src/language/mod.rs
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::borrow::Cow;
+
+use crate::tokenizers::{
+    chinese::ChineseTokenizer, japanese::JapaneseTokenizer, word::WordTokenizer, Token,
+};
+
+use self::detect::LanguageDetector;
+
+pub mod detect;
+pub mod stemmer;
+
+pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
+
+impl Language {
+    pub fn tokenize_text<'x>(
+        &self,
+        text: &'x str,
+        max_token_length: usize,
+    ) -> LanguageTokenizer<'x> {
+        match self {
+            Language::Japanese => Box::new(
+                JapaneseTokenizer::new(WordTokenizer::new(text, usize::MAX))
+                    .filter(move |t| t.word.len() <= max_token_length),
+            ),
+            Language::Mandarin => Box::new(
+                ChineseTokenizer::new(WordTokenizer::new(text, usize::MAX))
+                    .filter(move |t| t.word.len() <= max_token_length),
+            ),
+            _ => Box::new(WordTokenizer::new(text, max_token_length)),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
+pub enum Language {
+    Esperanto = 0,
+    English = 1,
+    Russian = 2,
+    Mandarin = 3,
+    Spanish = 4,
+    Portuguese = 5,
+    Italian = 6,
+    Bengali = 7,
+    French = 8,
+    German = 9,
+    Ukrainian = 10,
+    Georgian = 11,
+    Arabic = 12,
+    Hindi = 13,
+    Japanese = 14,
+    Hebrew = 15,
+    Yiddish = 16,
+    Polish = 17,
+    Amharic = 18,
+    Javanese = 19,
+    Korean = 20,
+    Bokmal = 21,
+    Danish = 22,
+    Swedish = 23,
+    Finnish = 24,
+    Turkish = 25,
+    Dutch = 26,
+    Hungarian = 27,
+    Czech = 28,
+    Greek = 29,
+    Bulgarian = 30,
+    Belarusian = 31,
+    Marathi = 32,
+    Kannada = 33,
+    Romanian = 34,
+    Slovene = 35,
+    Croatian = 36,
+    Serbian = 37,
+    Macedonian = 38,
+    Lithuanian = 39,
+    Latvian = 40,
+    Estonian = 41,
+    Tamil = 42,
+    Vietnamese = 43,
+    Urdu = 44,
+    Thai = 45,
+    Gujarati = 46,
+    Uzbek = 47,
+    Punjabi = 48,
+    Azerbaijani = 49,
+    Indonesian = 50,
+    Telugu = 51,
+    Persian = 52,
+    Malayalam = 53,
+    Oriya = 54,
+    Burmese = 55,
+    Nepali = 56,
+    Sinhalese = 57,
+    Khmer = 58,
+    Turkmen = 59,
+    Akan = 60,
+    Zulu = 61,
+    Shona = 62,
+    Afrikaans = 63,
+    Latin = 64,
+    Slovak = 65,
+    Catalan = 66,
+    Tagalog = 67,
+    Armenian = 68,
+    Unknown = 69,
+    None = 70,
+}
+
+impl Language {
+    pub fn from_iso_639(code: &str) -> Option<Self> {
+        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
+            "en" => Language::English,
+            "es" => Language::Spanish,
+            "pt" => Language::Portuguese,
+            "it" => Language::Italian,
+            "fr" => Language::French,
+            "de" => Language::German,
+            "ru" => Language::Russian,
+            "zh" => Language::Mandarin,
+            "ja" => Language::Japanese,
+            "ar" => Language::Arabic,
+            "hi" => Language::Hindi,
+            "ko" => Language::Korean,
+            "bn" => Language::Bengali,
+            "he" => Language::Hebrew,
+            "ur" => Language::Urdu,
+            "fa" => Language::Persian,
+            "ml" => Language::Malayalam,
+            "or" => Language::Oriya,
+            "my" => Language::Burmese,
+            "ne" => Language::Nepali,
+            "si" => Language::Sinhalese,
+            "km" => Language::Khmer,
+            "tk" => Language::Turkmen,
+            "am" => Language::Amharic,
+            "az" => Language::Azerbaijani,
+            "id" => Language::Indonesian,
+            "te" => Language::Telugu,
+            "ta" => Language::Tamil,
+            "vi" => Language::Vietnamese,
+            "gu" => Language::Gujarati,
+            "pa" => Language::Punjabi,
+            "uz" => Language::Uzbek,
+            "hy" => Language::Armenian,
+            "ka" => Language::Georgian,
+            "la" => Language::Latin,
+            "sl" => Language::Slovene,
+            "hr" => Language::Croatian,
+            "sr" => Language::Serbian,
+            "mk" => Language::Macedonian,
+            "lt" => Language::Lithuanian,
+            "lv" => Language::Latvian,
+            "et" => Language::Estonian,
+            "tl" => Language::Tagalog,
+            "af" => Language::Afrikaans,
+            "zu" => Language::Zulu,
+            "sn" => Language::Shona,
+            "ak" => Language::Akan,
+            _ => return None,
+        }
+        .into()
+    }
+}
+
+impl Language {
+    pub fn detect(text: String, default: Language) -> (String, Language) {
+        if let Some((l, t)) = text
+            .split_once(':')
+            .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
+        {
+            (t.to_string(), l)
+        } else {
+            let l = LanguageDetector::detect_single(&text)
+                .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
+                .unwrap_or(default);
+            (text, l)
+        }
+    }
+}
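A quick sketch of the language API above (a sketch, assuming the word tokenizer splits on whitespace and punctuation, as its tests elsewhere in this patch suggest):

use nlp::language::Language;

fn sketch() {
    // ISO-639 lookup tolerates a region subtag.
    assert_eq!(Language::from_iso_639("en-US"), Some(Language::English));

    // A "lang:" prefix wins over statistical detection.
    let (_text, lang) = Language::detect("fr:bonjour à tous".to_string(), Language::English);
    assert_eq!(lang, Language::French);

    // Tokenization dispatches to a script-specific tokenizer per language.
    let words: Vec<_> = Language::English
        .tokenize_text("Hello world", 40)
        .map(|t| t.word)
        .collect();
    assert_eq!(words.len(), 2);
}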
diff --git a/crates/store/src/fts/stemmer.rs b/crates/nlp/src/language/stemmer.rs
similarity index 93%
rename from crates/store/src/fts/stemmer.rs
rename to crates/nlp/src/language/stemmer.rs
index aa056d22..cd3da5e2 100644
--- a/crates/store/src/fts/stemmer.rs
+++ b/crates/nlp/src/language/stemmer.rs
@@ -25,25 +25,25 @@
 use std::borrow::Cow;
 
 use rust_stemmers::Algorithm;
 
-use super::{tokenizers::Tokenizer, Language};
+use super::{Language, LanguageTokenizer};
 
 #[derive(Debug, PartialEq, Eq)]
 pub struct StemmedToken<'x> {
     pub word: Cow<'x, str>,
     pub stemmed_word: Option<Cow<'x, str>>,
-    pub offset: u32, // Word offset in the text part
-    pub len: u8,     // Word length
+    pub from: usize, // Start offset of the word in the text part
+    pub to: usize,   // End offset of the word in the text part
 }
 
 pub struct Stemmer<'x> {
     stemmer: Option<rust_stemmers::Stemmer>,
-    tokenizer: Tokenizer<'x>,
+    tokenizer: LanguageTokenizer<'x>,
 }
 
 impl<'x> Stemmer<'x> {
     pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
         Stemmer {
-            tokenizer: Tokenizer::new(text, language, max_token_length),
+            tokenizer: language.tokenize_text(text, max_token_length),
             stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
         }
     }
@@ -57,15 +57,15 @@ impl<'x> Iterator for Stemmer<'x> {
         Some(StemmedToken {
             stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
                 match stemmer.stem(&token.word) {
-                    Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
+                    Cow::Owned(text) if text.len() != token.word.len() || text != token.word => {
                         Some(text.into())
                     }
                     _ => None,
                 }
             }),
             word: token.word,
-            offset: token.offset,
-            len: token.len,
+            from: token.from,
+            to: token.to,
        })
    }
 }
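The relocated stemmer keeps its old behavior behind the new tokenizer plumbing; a usage sketch (stemmed_word is None whenever stemming leaves the word unchanged):

use nlp::language::{stemmer::Stemmer, Language};

fn sketch() {
    for token in Stemmer::new("dogs running", Language::English, 40) {
        // Expected pairs: ("dogs", Some("dog")), ("running", Some("run")),
        // with from/to holding byte offsets into the input text.
        println!("{} -> {:?} [{}..{}]", token.word, token.stemmed_word, token.from, token.to);
    }
}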
diff --git a/crates/nlp/src/lib.rs b/crates/nlp/src/lib.rs
new file mode 100644
index 00000000..d933ea0e
--- /dev/null
+++ b/crates/nlp/src/lib.rs
@@ -0,0 +1,78 @@
+use ahash::AHashSet;
+
+pub mod bayes;
+pub mod language;
+pub mod tokenizers;
+pub mod transformers;
+
+#[derive(Debug, Clone, Default)]
+pub struct PublicSuffix {
+    pub suffixes: AHashSet<String>,
+    pub exceptions: AHashSet<String>,
+    pub wildcards: Vec<String>,
+}
+
+impl PublicSuffix {
+    pub fn contains(&self, suffix: &str) -> bool {
+        self.suffixes.contains(suffix)
+            || (!self.exceptions.contains(suffix)
+                && self.wildcards.iter().any(|w| suffix.ends_with(w)))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::fs;
+
+    use crate::{
+        bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
+        transformers::osb::{OsbToken, OsbTokenizer},
+    };
+
+    #[test]
+    #[ignore]
+    fn train() {
+        let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
+        let mut bayes = BayesModel::default();
+
+        for line in db.lines() {
+            let (text, is_spam) = line.rsplit_once(',').unwrap();
+            let is_spam = is_spam == "1";
+
+            bayes.train(
+                BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
+                is_spam,
+            );
+        }
+        println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
+        fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
+    }
+
+    #[test]
+    #[ignore]
+    fn classify() {
+        let model: BayesModel =
+            bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
+        let bayes = BayesClassifier::new();
+
+        for text in [
+            "i am attaching to this email a presentation to integrate the spreadsheet into our server",
+            "buy this great product special offer sales",
+            "i m using simple dns from jhsoft we support only a few web sites and i d like to swap secondary services with someone in a similar position",
+            "viagra xenical vioxx zyban propecia we only offer the real viagra xenical ",
+        ] {
+            println!(
+                "{:?} -> {}",
+                text,
+                bayes
+                    .classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
+                        OsbToken {
+                            idx: x.idx,
+                            inner: *w,
+                        }
+                    })), model.ham_learns, model.spam_learns)
+                    .unwrap()
+            );
+        }
+    }
+}
diff --git a/crates/nlp/src/tokenizers/chinese.rs b/crates/nlp/src/tokenizers/chinese.rs
new file mode 100644
index 00000000..f9ff355b
--- /dev/null
+++ b/crates/nlp/src/tokenizers/chinese.rs
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2023, Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::{borrow::Cow, vec::IntoIter};
+
+use jieba_rs::Jieba;
+
+use super::{InnerToken, Token};
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref JIEBA: Jieba = Jieba::new();
+}
+
+pub struct ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    tokenizer: T,
+    tokens: IntoIter<Token<I>>,
+    phantom: std::marker::PhantomData<&'x str>,
+}
+
+impl<'x, T, I> ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    pub fn new(tokenizer: T) -> Self {
+        ChineseTokenizer {
+            tokenizer,
+            tokens: Vec::new().into_iter(),
+            phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    type Item = Token<I>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(token) = self.tokens.next() {
+                return Some(token);
+            } else {
+                let token = self.tokenizer.next()?;
+                if token.word.is_alphabetic_8bit() {
+                    let mut token_to = token.from;
+                    match token.word.unwrap_alphabetic() {
+                        Cow::Borrowed(word) => {
+                            self.tokens = JIEBA
+                                .cut(word, false)
+                                .into_iter()
+                                .map(|word| {
+                                    let token_from = token_to;
+                                    token_to += word.len();
+                                    Token {
+                                        word: I::new_alphabetic(word),
+                                        from: token_from,
+                                        to: token_to,
+                                    }
+                                })
+                                .collect::<Vec<_>>()
+                                .into_iter();
+                        }
+                        Cow::Owned(word) => {
+                            self.tokens = JIEBA
+                                .cut(&word, false)
+                                .into_iter()
+                                .map(|word| {
+                                    let token_from = token_to;
+                                    token_to += word.len();
+                                    Token {
+                                        word: I::new_alphabetic(word.to_string()),
+                                        from: token_from,
+                                        to: token_to,
+                                    }
+                                })
+                                .collect::<Vec<_>>()
+                                .into_iter();
+                        }
+                    }
+                } else {
+                    return token.into();
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
+
+    #[test]
+    fn chinese_tokenizer() {
+        assert_eq!(
+            ChineseTokenizer::new(WordTokenizer::new(
+                "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
+                40
+            ),)
+            .collect::<Vec<_>>(),
+            vec![
+                Token {
+                    word: "孫".into(),
+                    from: 0,
+                    to: 3
+                },
+                Token {
+                    word: "子".into(),
+                    from: 3,
+                    to: 6
+                },
+                Token {
+                    word: "曰".into(),
+                    from: 6,
+                    to: 9
+                },
+                Token {
+                    word: "兵".into(),
+                    from: 12,
+                    to: 15
+                },
+                Token {
+                    word: "者".into(),
+                    from: 15,
+                    to: 18
+                },
+                Token {
+                    word: "國".into(),
+                    from: 21,
+                    to: 24
+                },
+                Token {
+                    word: "之".into(),
+                    from: 24,
+                    to: 27
+                },
+                Token {
+                    word: "大事".into(),
+                    from: 27,
+                    to: 33
+                },
+                Token {
+                    word: "死".into(),
+                    from: 36,
+                    to: 39
+                },
+                Token {
+                    word: "生".into(),
+                    from: 39,
+                    to: 42
+                },
+                Token {
+                    word: "之".into(),
+                    from: 42,
+                    to: 45
+                },
+                Token {
+                    word: "地".into(),
+                    from: 45,
+                    to: 48
+                },
+                Token {
+                    word: "存亡".into(),
+                    from: 51,
+                    to: 57
+                },
+                Token {
+                    word: "之".into(),
+                    from: 57,
+                    to: 60
+                },
+                Token {
+                    word: "道".into(),
+                    from: 60,
+                    to: 63
+                },
+                Token {
+                    word: "不可不".into(),
+                    from: 66,
+                    to: 75
+                },
+                Token {
+                    word: "察".into(),
+                    from: 75,
+                    to: 78
+                },
+                Token {
+                    word: "也".into(),
+                    from: 78,
+                    to: 81
+                }
+            ]
+        );
+    }
+}
diff --git a/crates/nlp/src/tokenizers/japanese.rs b/crates/nlp/src/tokenizers/japanese.rs
new file mode 100644
index 00000000..d0762c86
--- /dev/null
+++ b/crates/nlp/src/tokenizers/japanese.rs
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2023, Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::vec::IntoIter;
+
+use super::{InnerToken, Token};
+
+pub struct JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    tokenizer: T,
+    tokens: IntoIter<Token<I>>,
+    phantom: std::marker::PhantomData<&'x str>,
+}
+
+impl<'x, T, I> JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    pub fn new(tokenizer: T) -> Self {
+        JapaneseTokenizer {
+            tokenizer,
+            tokens: Vec::new().into_iter(),
+            phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'x, T, I> Iterator for JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    type Item = Token<I>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(token) = self.tokens.next() {
+                return Some(token);
+            } else {
+                let token = self.tokenizer.next()?;
+                if token.word.is_alphabetic_8bit() {
+                    let mut token_to = token.from;
+                    self.tokens = tinysegmenter::tokenize(token.word.unwrap_alphabetic().as_ref())
+                        .into_iter()
+                        .map(|word| {
+                            let token_from = token_to;
+                            token_to += word.len();
+                            Token {
+                                word: I::new_alphabetic(word.to_string()),
+                                from: token_from,
+                                to: token_to,
+                            }
+                        })
+                        .collect::<Vec<_>>()
+                        .into_iter();
+                } else {
+                    return token.into();
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tokenizers::{japanese::JapaneseTokenizer, word::WordTokenizer, Token};
+
+    #[test]
+    fn japanese_tokenizer() {
+        assert_eq!(
+            JapaneseTokenizer::new(WordTokenizer::new(
+                "お先に失礼します あなたの名前は何ですか 123 abc-872",
+                40
+            ))
+            .collect::<Vec<_>>(),
+            vec![
+                Token {
+                    word: "お先".into(),
+                    from: 0,
+                    to: 6
+                },
+                Token {
+                    word: "に".into(),
+                    from: 6,
+                    to: 9
+                },
+                Token {
+                    word: "失礼".into(),
+                    from: 9,
+                    to: 15
+                },
+                Token {
+                    word: "し".into(),
+                    from: 15,
+                    to: 18
+                },
+                Token {
+                    word: "ます".into(),
+                    from: 18,
+                    to: 24
+                },
+                Token {
+                    word: "あなた".into(),
+                    from: 25,
+                    to: 34
+                },
+                Token {
+                    word: "の".into(),
+                    from: 34,
+                    to: 37
+                },
+                Token {
+                    word: "名前".into(),
+                    from: 37,
+                    to: 43
+                },
+                Token {
+                    word: "は".into(),
+                    from: 43,
+                    to: 46
+                },
+                Token {
+                    word: "何".into(),
+                    from: 46,
+                    to: 49
+                },
+                Token {
+                    word: "です".into(),
+                    from: 49,
+                    to: 55
+                },
+                Token {
+                    word: "か".into(),
+                    from: 55,
+                    to: 58
+                },
+                Token {
+                    word: "123".into(),
+                    from: 59,
+                    to: 62
+                },
+                Token {
+                    word: "abc".into(),
+                    from: 63,
+                    to: 66
+                },
+                Token {
+                    word: "872".into(),
+                    from: 67,
+                    to: 70
+                }
+            ]
+        );
+    }
+}
diff --git a/crates/nlp/src/tokenizers/mod.rs b/crates/nlp/src/tokenizers/mod.rs
new file mode 100644
index 00000000..a3e42d47
--- /dev/null
+++ b/crates/nlp/src/tokenizers/mod.rs
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+pub mod chinese;
+pub mod japanese;
+pub mod space;
+pub mod types;
+pub mod word;
+
+use std::borrow::Cow;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token<T> {
+    pub word: T,
+    pub from: usize,
+    pub to: usize,
+}
+
+pub trait InnerToken<'x>: Sized {
+    fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self;
+    fn unwrap_alphabetic(self) -> Cow<'x, str>;
+    fn is_alphabetic(&self) -> bool;
+    fn is_alphabetic_8bit(&self) -> bool;
+}
+
+impl<'x> InnerToken<'x> for Cow<'x, str> {
+    fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self {
+        value.into()
+    }
+
+    fn is_alphabetic(&self) -> bool {
+        true
+    }
+
+    fn is_alphabetic_8bit(&self) -> bool {
+        !self.chars().all(|c| c.is_ascii())
+    }
+
+    fn unwrap_alphabetic(self) -> Cow<'x, str> {
+        self
+    }
+}
+
+impl<T> Token<T> {
+    pub fn new(offset: usize, len: usize, word: T) -> Token<T> {
+        debug_assert!(offset <= u32::max_value() as usize);
+        debug_assert!(len <= u8::max_value() as usize);
+        Token {
+            from: offset,
+            to: offset + len,
+            word,
+        }
+    }
+}
diff --git a/crates/store/src/fts/tokenizers/space.rs b/crates/nlp/src/tokenizers/space.rs
similarity index 100%
rename from crates/store/src/fts/tokenizers/space.rs
rename to crates/nlp/src/tokenizers/space.rs
diff --git a/crates/nlp/src/tokenizers/types.rs b/crates/nlp/src/tokenizers/types.rs
new file mode 100644
index 00000000..97e9bccb
--- /dev/null
+++ b/crates/nlp/src/tokenizers/types.rs
@@ -0,0 +1,2878 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::str::CharIndices;
+
+use crate::PublicSuffix;
+
+use super::Token;
+
+pub struct TypesTokenizer<'x, 'y> {
+    text: &'x str,
+    suffixes: &'y PublicSuffix,
+    iter: CharIndices<'x>,
+    tokens: Vec<Token<TokenType<'x>>>,
+    peek_pos: usize,
+    last_ch_is_space: bool,
+    last_token_is_dot: bool,
+    eof: bool,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType<'x> {
+    Alphabetic(&'x str),
+    Integer(&'x str),
+    Alphanumeric(&'x str),
+    Hexadecimal(&'x str),
+    Other(char),
+    Punctuation(char),
+    Space,
+
+    // Detected types
+    Url(&'x str),
+    UrlNoScheme(&'x str),
+    UrlNoHost(&'x str),
+    Email(&'x str),
+    Float(&'x str),
+}
+
+impl Copy for Token<TokenType<'_>> {}
+
+impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
+    type Item = Token<TokenType<'x>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let token = self.peek()?;
+        let last_is_dot = self.last_token_is_dot;
+        self.last_token_is_dot = matches!(token.word, TokenType::Punctuation('.'));
+
+        // Try parsing URL with scheme
+        if matches!(
+            token.word,
+            TokenType::Alphabetic(t) | TokenType::Hexadecimal(t)
+                if t.len() <= 8 && t.chars().all(|c| c.is_ascii()))
+            && self.try_skip_url_scheme()
+        {
+            if let Some(url) = self.try_parse_url(token.into()) {
+                self.peek_advance();
+                return Some(url);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing email
+        if token.word.is_email_atom()
+            && self.peek_has_tokens(
+                &[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
+                TokenType::Space,
+            )
+        {
+            if let Some(email) = self.try_parse_email() {
+                self.peek_advance();
+                return Some(email);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing URL without scheme
+        if token.word.is_domain_atom(true)
+            && self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
+        {
+            if let Some(url) = self.try_parse_url(None) {
+                self.peek_advance();
+                return Some(url);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing currencies and floating point numbers
+        if !last_is_dot {
+            if let Some(num) = self.try_parse_number() {
+                self.peek_advance();
+                return Some(num);
+            }
+        }
+
+        self.peek_rewind();
+        self.next_()
+    }
+}
+
+impl<'x, 'y> TypesTokenizer<'x, 'y> {
+    pub fn new(text: &'x str, suffixes: &'y PublicSuffix) -> Self {
+        Self {
+            text,
+            iter: text.char_indices(),
+            tokens: Vec::new(),
+            eof: false,
+            peek_pos: 0,
+            suffixes,
+            last_ch_is_space: false,
+            last_token_is_dot: false,
+        }
+    }
+
+    fn consume(&mut self) -> bool {
+        let mut has_alpha = false;
+        let mut has_number = false;
+        let mut has_hex = false;
+
+        let mut start_pos = usize::MAX;
+        let mut end_pos = usize::MAX;
+
+        let mut stop_char = None;
+
+        for (pos, ch) in self.iter.by_ref() {
+            if ch.is_alphabetic() {
+                if ch.is_ascii_hexdigit() {
+                    has_hex = true;
+                } else {
+                    has_alpha = true;
+                }
+            } else if ch.is_ascii_digit() {
+                has_number = true;
+            } else {
+                let last_was_space = self.last_ch_is_space;
+                self.last_ch_is_space = ch.is_ascii_whitespace();
+                stop_char = Token {
+                    word: if self.last_ch_is_space {
+                        if last_was_space {
+                            continue;
+                        } else {
+                            TokenType::Space
+                        }
+                    } else if ch.is_ascii() {
+                        TokenType::Punctuation(ch)
+                    } else {
+                        TokenType::Other(ch)
+                    },
+                    from: pos,
+                    to: pos + ch.len_utf8(),
+                }
+                .into();
+                break;
+            }
+            self.last_ch_is_space = false;
+
+            if start_pos == usize::MAX {
+                start_pos = pos;
+            }
+            end_pos = pos + ch.len_utf8();
+        }
+
+        if start_pos != usize::MAX {
+            let text = &self.text[start_pos..end_pos];
+
+            self.tokens.push(Token {
+                word: if has_alpha && has_number {
+                    TokenType::Alphanumeric(text)
+                } else if has_alpha {
+                    TokenType::Alphabetic(text)
+                } else if has_hex {
+ fn next_(&mut self) -> Option<Token<TokenType<'x>>> {
+ if self.tokens.is_empty() && !self.eof {
+ self.consume();
+ }
+ if !self.tokens.is_empty() {
+ Some(self.tokens.remove(0))
+ } else {
+ None
+ }
+ }
+
+ fn peek(&mut self) -> Option<Token<TokenType<'x>>> {
+ while self.tokens.len() <= self.peek_pos && !self.eof {
+ self.consume();
+ }
+ self.tokens.get(self.peek_pos).map(|t| {
+ self.peek_pos += 1;
+ *t
+ })
+ }
+
+ fn peek_advance(&mut self) {
+ if self.peek_pos > 0 {
+ self.tokens.drain(..self.peek_pos);
+ self.peek_pos = 0;
+ }
+ }
+
+ fn peek_rewind(&mut self) {
+ self.peek_pos = 0;
+ }
+
+ fn peek_has_tokens(&mut self, tokens: &[TokenType<'_>], stop_token: TokenType<'_>) -> bool {
+ let mut tokens = tokens.iter().copied();
+ let mut token = tokens.next().unwrap();
+ while let Some(t) = self.peek() {
+ if t.word == token {
+ if let Some(next_token) = tokens.next() {
+ token = next_token;
+ } else {
+ self.peek_rewind();
+ return true;
+ }
+ } else if t.word == stop_token {
+ break;
+ }
+ }
+
+ self.peek_rewind();
+ false
+ }
+
+ fn try_parse_url(
+ &mut self,
+ scheme_token: Option<Token<TokenType<'x>>>,
+ ) -> Option<Token<TokenType<'x>>> {
+ let (has_scheme, allow_blank_host) = scheme_token.as_ref().map_or((false, false), |t| {
+ (
+ true,
+ matches!(t.word, TokenType::Alphabetic(s) if s.eq_ignore_ascii_case("file")),
+ )
+ });
+ if has_scheme {
+ let restore_pos = self.peek_pos;
+ let mut has_user_info = false;
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('@') => {
+ has_user_info = true;
+ break;
+ }
+ TokenType::Alphabetic(_)
+ | TokenType::Alphanumeric(_)
+ | TokenType::Integer(_)
+ | TokenType::Hexadecimal(_)
+ | TokenType::Punctuation(
+ '-' | '.' | '_' | '~' | '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+'
+ | ',' | ';' | '=' | ':',
+ ) => (),
+ _ => break,
+ }
+ }
+
+ if !has_user_info {
+ self.peek_pos = restore_pos;
+ }
+ }
+
+ // Try parsing hostname
+ let mut is_valid_host = true;
+ let (host_start_pos, mut end_pos) = if has_scheme {
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ let mut text_count = 0;
+ let mut int_count = 0;
+ let mut dot_count = 0;
+ let mut is_ipv6 = false;
+
+ let mut last_label_is_tld = false;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Alphabetic(text)
+ | TokenType::Alphanumeric(text)
+ | TokenType::Hexadecimal(text) => {
+ last_label_is_tld =
+ text.len() >= 2 && self.suffixes.contains(&text.to_ascii_lowercase());
+ text_count += 1;
+ }
+ TokenType::Integer(text) => {
+ if text.len() <= 3 {
+ int_count += 1;
+ }
+ }
+ TokenType::Punctuation('.') => {
+ dot_count += 1;
+ continue;
+ }
+ TokenType::Punctuation('[') if start_pos == usize::MAX => {
+ let (_, to) = self.try_parse_ipv6(token.from)?;
+ start_pos = token.from;
+ end_pos = to;
+ restore_pos = self.peek_pos;
+ is_ipv6 = true;
+ break;
+ }
+ // these sub-delims are tolerated inside the host section, but
+ // the arm below `continue`s without extending end_pos, so
+ // trailing ones never become part of the match
+ TokenType::Punctuation(
+ '-' | '_' | '~' | '!'
| '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' + | ';' | '=' | ':' | '%', + ) => { + continue; + } + TokenType::Punctuation('/') if allow_blank_host => { + // Allow file://../ urls + end_pos = token.from; + restore_pos = self.peek_pos - 1; + break; + } + _ => break, + } + + if start_pos == usize::MAX { + start_pos = token.from; + } + end_pos = token.to; + restore_pos = self.peek_pos; + } + + self.peek_pos = restore_pos; + if end_pos != usize::MAX { + is_valid_host = + (last_label_is_tld && dot_count >= 1 && (text_count + int_count) >= 2) + || (int_count == 4 && dot_count == 3) + || is_ipv6; + (start_pos, end_pos) + } else { + return None; + } + } else { + // Strict hostname parsing + self.try_parse_hostname()? + }; + + // Try parsing port + let start_pos = scheme_token.map(|t| t.from).unwrap_or(host_start_pos); + let mut restore_pos = self.peek_pos; + let mut has_port = false; + let mut last_is_colon = false; + let mut found_query_start = false; + while let Some(token) = self.peek() { + match token.word { + TokenType::Punctuation(':') if !last_is_colon && !has_port => { + last_is_colon = true; + } + TokenType::Integer(_) if last_is_colon => { + has_port = true; + last_is_colon = false; + restore_pos = self.peek_pos; + end_pos = token.to; + } + TokenType::Punctuation('/' | '?') if !last_is_colon => { + found_query_start = true; + end_pos = token.to; + break; + } + _ => { + self.peek_pos = restore_pos; + break; + } + } + } + + // Try parsing query + if found_query_start { + restore_pos = self.peek_pos; + let mut p_count = 0; + let mut b_count = 0; + let mut c_count = 0; + let mut seen_quote = false; + while let Some(token) = self.peek() { + match token.word { + TokenType::Alphabetic(_) + | TokenType::Alphanumeric(_) + | TokenType::Integer(_) + | TokenType::Hexadecimal(_) + | TokenType::Other(_) => {} + TokenType::Punctuation('(') => { + p_count += 1; + continue; + } + TokenType::Punctuation('[') => { + b_count += 1; + continue; + } + TokenType::Punctuation('{') => { + c_count += 1; + continue; + } + TokenType::Punctuation(')') if p_count > 0 => { + p_count -= 1; + } + TokenType::Punctuation(']') if b_count > 0 => { + b_count -= 1; + } + TokenType::Punctuation('}') if c_count > 0 => { + c_count -= 1; + } + TokenType::Punctuation('\'') => { + if !seen_quote { + seen_quote = true; + continue; + } else { + seen_quote = false; + } + } + TokenType::Punctuation('/') => {} + TokenType::Punctuation( + '-' | '_' | '~' | '!' | '$' | '&' | '*' | '+' | ',' | ';' | '=' | ':' | '%' + | '?' | '.' 
| '@',
+ ) => {
+ // accepted via `continue`, which skips the end_pos update,
+ // so a URL never ends on one of these characters
+ continue;
+ }
+ _ => break,
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+ self.peek_pos = restore_pos;
+ }
+
+ Token {
+ word: if has_scheme {
+ if is_valid_host {
+ TokenType::Url(&self.text[start_pos..end_pos])
+ } else {
+ TokenType::UrlNoHost(&self.text[start_pos..end_pos])
+ }
+ } else {
+ TokenType::UrlNoScheme(&self.text[start_pos..end_pos])
+ },
+ from: start_pos,
+ to: end_pos,
+ }
+ .into()
+ }
+
+ fn try_parse_email(&mut self) -> Option<Token<TokenType<'x>>> {
+ // Start token is a valid local part atom
+ let start_token = self.peek()?;
+ let mut last_is_dot = false;
+
+ // Find local part
+ loop {
+ let token = self.peek()?;
+ match token.word {
+ word if word.is_email_atom() => {
+ last_is_dot = false;
+ }
+ TokenType::Punctuation('@') if !last_is_dot => {
+ break;
+ }
+ TokenType::Punctuation('.') if !last_is_dot => {
+ last_is_dot = true;
+ }
+ _ => {
+ return None;
+ }
+ }
+ }
+
+ // Obtain domain part
+ let (_, end_pos) = self.try_parse_hostname()?;
+
+ Token {
+ word: TokenType::Email(&self.text[start_token.from..end_pos]),
+ from: start_token.from,
+ to: end_pos,
+ }
+ .into()
+ }
+
+ fn try_parse_hostname(&mut self) -> Option<(usize, usize)> {
+ let mut last_ch = u8::MAX;
+ let mut has_int = false;
+ let mut has_alpha = false;
+ let mut last_label_is_tld = false;
+
+ let mut dot_count = 0;
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('.') if last_ch == 0 && start_pos != usize::MAX => {
+ last_ch = b'.';
+ dot_count += 1;
+ continue;
+ }
+ // labels may contain '-' but, since this arm also `continue`s
+ // without extending end_pos, a label can never end on one
+ TokenType::Punctuation('-') if last_ch == 0 || last_ch == b'-' => {
+ last_ch = b'-';
+ continue;
+ }
+ TokenType::Punctuation('[') if start_pos == usize::MAX => {
+ return self.try_parse_ipv6(token.from);
+ }
+ TokenType::Alphabetic(text)
+ | TokenType::Alphanumeric(text)
+ | TokenType::Hexadecimal(text)
+ if text.len() <= 63 =>
+ {
+ last_label_is_tld =
+ text.len() >= 2 && self.suffixes.contains(&text.to_ascii_lowercase());
+ has_alpha = true;
+ last_ch = 0;
+ }
+ TokenType::Other(_) => {
+ has_alpha = true;
+ last_label_is_tld = false;
+ last_ch = 0;
+ }
+ TokenType::Integer(text) => {
+ if text.len() <= 3 {
+ has_int = true;
+ }
+ last_label_is_tld = false;
+ last_ch = 0;
+ }
+ _ => {
+ break;
+ }
+ }
+
+ if start_pos == usize::MAX {
+ start_pos = token.from;
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+ self.peek_pos = restore_pos;
+
+ // a trailing dot is not counted
+ if last_ch == b'.' {
+ dot_count -= 1;
+ }
+
+ if end_pos != usize::MAX
+ && dot_count >= 1
+ && (last_label_is_tld || (has_int && !has_alpha && dot_count == 3))
+ {
+ (start_pos, end_pos).into()
+ } else {
+ None
+ }
+ }
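+
+ // a bracketed IPv6 literal: hex/integer groups separated by ':', with an
+ // optional dotted (IPv4-mapped) tail that may only appear at the end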
+ fn try_parse_ipv6(&mut self, start_pos: usize) -> Option<(usize, usize)> {
+ let mut found_colon = false;
+ let mut last_ch = u8::MAX;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Integer(_) | TokenType::Hexadecimal(_) => {
+ last_ch = 0;
+ }
+ TokenType::Punctuation(':') if last_ch != b'.' => {
+ found_colon = true;
+ last_ch = b':';
+ }
+ TokenType::Punctuation('.') if last_ch == 0 => {
+ last_ch = b'.';
+ }
+ TokenType::Punctuation(']') if found_colon && last_ch == 0 => {
+ return (start_pos, token.to).into();
+ }
+ _ => return None,
+ }
+ }
+
+ None
+ }
+
+ fn try_parse_number(&mut self) -> Option<Token<TokenType<'x>>> {
+ self.peek_rewind();
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ let mut seen_integer = 0;
+ let mut seen_dot = false;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('-') if start_pos == usize::MAX => {}
+ TokenType::Integer(_) if seen_integer == 0 || seen_dot => {
+ seen_integer += 1;
+ }
+ TokenType::Punctuation('.') if seen_integer != 0 => {
+ if !seen_dot {
+ seen_dot = true;
+ continue;
+ } else {
+ // Avoid parsing num.num.num as floats
+ return None;
+ }
+ }
+ _ => break,
+ }
+
+ if start_pos == usize::MAX {
+ start_pos = token.from;
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+
+ self.peek_pos = restore_pos;
+
+ if seen_integer > 0 {
+ let text = &self.text[start_pos..end_pos];
+
+ Token {
+ // two integer parts seen, one on each side of the dot
+ word: if seen_integer == 2 {
+ TokenType::Float(text)
+ } else {
+ TokenType::Integer(text)
+ },
+ from: start_pos,
+ to: end_pos,
+ }
+ .into()
+ } else {
+ None
+ }
+ }
+
+ // consumes "://" after the scheme word already peeked by the caller,
+ // optionally allowing a '+tag' segment as in "svn+ssh://"
+ fn try_skip_url_scheme(&mut self) -> bool {
+ enum State {
+ None,
+ PlusAlpha,
+ Colon,
+ Slash1,
+ Slash2,
+ }
+ let mut state = State::None;
+
+ while let Some(token) = self.peek() {
+ state = match (token.word, state) {
+ (TokenType::Punctuation(':'), State::None | State::Colon) => State::Slash1,
+ (TokenType::Punctuation('/'), State::Slash1) => State::Slash2,
+ (TokenType::Punctuation('/'), State::Slash2) => return true,
+ (TokenType::Punctuation('+'), State::None) => State::PlusAlpha,
+ (TokenType::Alphabetic(t) | TokenType::Hexadecimal(t), State::PlusAlpha)
+ if t.chars().all(|c| c.is_ascii()) =>
+ {
+ State::Colon
+ }
+ _ => break,
+ };
+ }
+ self.peek_rewind();
+ false
+ }
+}
+
+impl<'x> TokenType<'x> {
+ fn is_email_atom(&self) -> bool {
+ // (roughly the RFC 5321 atext set)
+ matches!(
+ self,
+ TokenType::Alphabetic(_)
+ | TokenType::Integer(_)
+ | TokenType::Alphanumeric(_)
+ | TokenType::Hexadecimal(_)
+ | TokenType::Other(_)
+ | TokenType::Punctuation(
+ '!' | '#'
+ | '$'
+ | '%'
+ | '&'
+ | '\''
+ | '*'
+ | '+'
+ | '-'
+ | '/'
+ | '='
+ | '?'
+ | '^' + | '_' + | '`' + | '{' + | '|' + | '}' + | '~', + ) + ) + } + + fn is_domain_atom(&self, is_start: bool) -> bool { + matches!( + self, + TokenType::Alphabetic(_) + | TokenType::Integer(_) + | TokenType::Alphanumeric(_) + | TokenType::Hexadecimal(_) + | TokenType::Other(_) + ) || (!is_start && matches!(self, TokenType::Punctuation('-'))) + } +} + +#[cfg(test)] +mod test { + use crate::PublicSuffix; + + use super::{TokenType, TypesTokenizer}; + + #[test] + fn type_tokenizer() { + let mut suffixes = PublicSuffix::default(); + suffixes.suffixes.insert("com".to_string()); + suffixes.suffixes.insert("co".to_string()); + suffixes.suffixes.insert("org".to_string()); + + // Credits: test suite from linkify crate + for (text, expected) in [ + ("", vec![]), + ("foo", vec![TokenType::Alphabetic("foo")]), + (":", vec![TokenType::Punctuation(':')]), + ( + "://", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + ":::", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation(':'), + TokenType::Punctuation(':'), + ], + ), + ( + "://foo", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "1://foo", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "123://foo", + vec![ + TokenType::Integer("123"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "+://foo", + vec![ + TokenType::Punctuation('+'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "-://foo", + vec![ + TokenType::Punctuation('-'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + ".://foo", + vec![ + TokenType::Punctuation('.'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ("1abc://foo", vec![TokenType::UrlNoHost("1abc://foo")]), + ("a://foo", vec![TokenType::UrlNoHost("a://foo")]), + ("a123://foo", vec![TokenType::UrlNoHost("a123://foo")]), + ("a123b://foo", vec![TokenType::UrlNoHost("a123b://foo")]), + ("a+b://foo", vec![TokenType::UrlNoHost("a+b://foo")]), + ( + "a-b://foo", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('-'), + TokenType::UrlNoHost("b://foo"), + ], + ), + ( + "a.b://foo", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::UrlNoHost("b://foo"), + ], + ), + ("ABC://foo", vec![TokenType::UrlNoHost("ABC://foo")]), + ( + ".http://example.org/", + vec![ + TokenType::Punctuation('.'), + TokenType::Url("http://example.org/"), + ], + ), + ( + "1.http://example.org/", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Url("http://example.org/"), + ], + ), + ( + "ab://", + vec![ + TokenType::Hexadecimal("ab"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + "file://", + vec![ + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + "file:// ", + vec![ + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + 
TokenType::Punctuation('/'), + TokenType::Space, + ], + ), + ( + "\"file://\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Punctuation('"'), + ], + ), + ( + "\"file://...\", ", + vec![ + TokenType::Punctuation('"'), + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('"'), + TokenType::Punctuation(','), + TokenType::Space, + ], + ), + ( + "file://somefile", + vec![TokenType::UrlNoHost("file://somefile")], + ), + ( + "file://../relative", + vec![TokenType::UrlNoHost("file://../relative")], + ), + ( + "http://a.", + vec![ + TokenType::UrlNoHost("http://a"), + TokenType::Punctuation('.'), + ], + ), + ("http://127.0.0.1", vec![TokenType::Url("http://127.0.0.1")]), + ( + "http://127.0.0.1/", + vec![TokenType::Url("http://127.0.0.1/")], + ), + ("ab://c", vec![TokenType::UrlNoHost("ab://c")]), + ( + "http://example.org/", + vec![TokenType::Url("http://example.org/")], + ), + ( + "http://example.org/123", + vec![TokenType::Url("http://example.org/123")], + ), + ( + "http://example.org/?foo=test&bar=123", + vec![TokenType::Url("http://example.org/?foo=test&bar=123")], + ), + ( + "http://example.org/?foo=%20", + vec![TokenType::Url("http://example.org/?foo=%20")], + ), + ( + "http://example.org/%3C", + vec![TokenType::Url("http://example.org/%3C")], + ), + ("example.org/", vec![TokenType::UrlNoScheme("example.org/")]), + ( + "example.org/123", + vec![TokenType::UrlNoScheme("example.org/123")], + ), + ( + "example.org/?foo=test&bar=123", + vec![TokenType::UrlNoScheme("example.org/?foo=test&bar=123")], + ), + ( + "example.org/?foo=%20", + vec![TokenType::UrlNoScheme("example.org/?foo=%20")], + ), + ( + "example.org/%3C", + vec![TokenType::UrlNoScheme("example.org/%3C")], + ), + ( + "foo http://example.org/", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::Url("http://example.org/"), + ], + ), + ( + "http://example.org/ bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\tbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\nbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\u{b}bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{b}'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\u{c}bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\rbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "foo example.org/", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::UrlNoScheme("example.org/"), + ], + ), + ( + "example.org/ bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\tbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\nbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + 
TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\u{b}bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{b}'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\u{c}bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\rbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/<", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('<'), + ], + ), + ( + "http://example.org/>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/<>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('<'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/\0", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\0'), + ], + ), + ( + "http://example.org/\u{e}", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{e}'), + ], + ), + ( + "http://example.org/\u{7f}", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{7f}'), + ], + ), + ( + "http://example.org/\u{9f}", + vec![TokenType::Url("http://example.org/\u{9f}")], + ), + ( + "http://example.org/foo|bar", + vec![ + TokenType::Url("http://example.org/foo"), + TokenType::Punctuation('|'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/<", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('<'), + ], + ), + ( + "example.org/>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/<>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('<'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/\0", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\0'), + ], + ), + ( + "example.org/\u{e}", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{e}'), + ], + ), + ( + "example.org/\u{7f}", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{7f}'), + ], + ), + ( + "example.org/\u{9f}", + vec![TokenType::UrlNoScheme("example.org/\u{9f}")], + ), + ( + "http://example.org/.", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/..", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/,", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(','), + ], + ), + ( + "http://example.org/:", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(':'), + ], + ), + ( + "http://example.org/?", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('?'), + ], + ), + ( + "http://example.org/!", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('!'), + ], + ), + ( + "http://example.org/;", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(';'), + ], + ), + ( + "example.org/.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/..", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/,", + vec![ + TokenType::UrlNoScheme("example.org/"), + 
TokenType::Punctuation(','), + ], + ), + ( + "example.org/:", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(':'), + ], + ), + ( + "example.org/?", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('?'), + ], + ), + ( + "example.org/!", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('!'), + ], + ), + ( + "example.org/;", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org/a(b)", + vec![TokenType::Url("http://example.org/a(b)")], + ), + ( + "http://example.org/a[b]", + vec![TokenType::Url("http://example.org/a[b]")], + ), + ( + "http://example.org/a{b}", + vec![TokenType::Url("http://example.org/a{b}")], + ), + ( + "http://example.org/a'b'", + vec![TokenType::Url("http://example.org/a'b'")], + ), + ( + "(http://example.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "[http://example.org/]", + vec![ + TokenType::Punctuation('['), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(']'), + ], + ), + ( + "{http://example.org/}", + vec![ + TokenType::Punctuation('{'), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('}'), + ], + ), + ( + "\"http://example.org/\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + ], + ), + ( + "'http://example.org/'", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\''), + ], + ), + ( + "example.org/a(b)", + vec![TokenType::UrlNoScheme("example.org/a(b)")], + ), + ( + "example.org/a[b]", + vec![TokenType::UrlNoScheme("example.org/a[b]")], + ), + ( + "example.org/a{b}", + vec![TokenType::UrlNoScheme("example.org/a{b}")], + ), + ( + "example.org/a'b'", + vec![TokenType::UrlNoScheme("example.org/a'b'")], + ), + ( + "(example.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "[example.org/]", + vec![ + TokenType::Punctuation('['), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(']'), + ], + ), + ( + "{example.org/}", + vec![ + TokenType::Punctuation('{'), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('}'), + ], + ), + ( + "\"example.org/\"", + vec![ + TokenType::Punctuation('"'), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('"'), + ], + ), + ( + "'example.org/'", + vec![ + TokenType::Punctuation('\''), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\''), + ], + ), + ( + "((http://example.org/))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "((http://example.org/a(b)))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/a(b)"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "[(http://example.org/)]", + vec![ + TokenType::Punctuation('['), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(']'), + ], + ), + ( + "(http://example.org/).", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('.'), + ], + ), + ( + "(http://example.org/.)", + 
vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation(')'), + ], + ), + ( + "http://example.org/>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/(", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('('), + ], + ), + ( + "http://example.org/(.", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('('), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/]()", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(']'), + TokenType::Punctuation('('), + TokenType::Punctuation(')'), + ], + ), + ( + "((example.org/))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "((example.org/a(b)))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/a(b)"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "[(example.org/)]", + vec![ + TokenType::Punctuation('['), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(']'), + ], + ), + ( + "(example.org/).", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('.'), + ], + ), + ( + "(example.org/.)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation(')'), + ], + ), + ( + "example.org/>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/(", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('('), + ], + ), + ( + "example.org/(.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('('), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/]()", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(']'), + TokenType::Punctuation('('), + TokenType::Punctuation(')'), + ], + ), + ( + "'https://example.org'", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + ], + ), + ( + "\"https://example.org\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Url("https://example.org"), + TokenType::Punctuation('"'), + ], + ), + ( + "''https://example.org''", + vec![ + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + ], + ), + ( + "'https://example.org''", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + ], + ), + ( + "'https://example.org", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + ], + ), + ( + "http://example.org/'_(foo)", + vec![TokenType::Url("http://example.org/'_(foo)")], + ), + ( + "http://example.org/'_(foo)'", + vec![TokenType::Url("http://example.org/'_(foo)'")], + ), + ( + "http://example.org/''", + vec![TokenType::Url("http://example.org/''")], + ), + ( + "http://example.org/'''", + vec![ + TokenType::Url("http://example.org/''"), + TokenType::Punctuation('\''), + ], + ), + ( + "http://example.org/'.", + vec![ + 
TokenType::Url("http://example.org/"), + TokenType::Punctuation('\''), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/'a", + vec![TokenType::Url("http://example.org/'a")], + ), + ( + "http://example.org/it's", + vec![TokenType::Url("http://example.org/it's")], + ), + ( + "example.org/'_(foo)", + vec![TokenType::UrlNoScheme("example.org/'_(foo)")], + ), + ( + "example.org/'_(foo)'", + vec![TokenType::UrlNoScheme("example.org/'_(foo)'")], + ), + ( + "example.org/''", + vec![TokenType::UrlNoScheme("example.org/''")], + ), + ( + "example.org/'''", + vec![ + TokenType::UrlNoScheme("example.org/''"), + TokenType::Punctuation('\''), + ], + ), + ( + "example.org/'.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\''), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/'a", + vec![TokenType::UrlNoScheme("example.org/'a")], + ), + ( + "example.org/it's", + vec![TokenType::UrlNoScheme("example.org/it's")], + ), + ( + "http://example.org/\"a", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "http://example.org/\"a\"", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('"'), + ], + ), + ( + "http://example.org/`a", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "http://example.org/`a`", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('`'), + ], + ), + ( + "https://example.org*", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/*", + vec![ + TokenType::Url("https://example.org/"), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/**", + vec![ + TokenType::Url("https://example.org/"), + TokenType::Punctuation('*'), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/*/a", + vec![TokenType::Url("https://example.org/*/a")], + ), + ( + "example.org/`a", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "example.org/`a`", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('`'), + ], + ), + ( + "http://example.org\">", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org'>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org\"/>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org'/>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org
<p>
", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('<'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org
</p>
", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\">", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org'>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\"/>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org'/>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org
<p>
", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('<'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org
</p>
", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org\");", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org');", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("img"), + TokenType::Space, + TokenType::Alphabetic("src"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/test.svg"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Hexadecimal("a"), + TokenType::Space, + TokenType::Alphabetic("href"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Hexadecimal("a"), + TokenType::Space, + TokenType::Alphabetic("href"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Space, + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "
<div> \n <img src=\"http://example.org/test3.jpg\" /> \n </div>
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Space, + TokenType::Punctuation('<'), + TokenType::Alphabetic("img"), + TokenType::Space, + TokenType::Alphabetic("src"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/test3.jpg"), + TokenType::Punctuation('"'), + TokenType::Space, + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + TokenType::Space, + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\");", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "example.org');", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org/", + vec![TokenType::Url("http://example.org/")], + ), + ( + "http://example.org/a/", + vec![TokenType::Url("http://example.org/a/")], + ), + ( + "http://example.org//", + vec![TokenType::Url("http://example.org//")], + ), + ("example.org/", vec![TokenType::UrlNoScheme("example.org/")]), + ( + "example.org/a/", + vec![TokenType::UrlNoScheme("example.org/a/")], + ), + ( + "example.org//", + vec![TokenType::UrlNoScheme("example.org//")], + ), + ( + "http://one.org/ http://two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "http://one.org/ : http://two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "(http://one.org/)(http://two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::Url("http://two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "one.org/ two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "one.org/ : two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "(one.org/)(two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "http://one.org/ two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "one.org/ : http://two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "(http://one.org/)(two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "http://üñîçøðé.com", + vec![TokenType::Url("http://üñîçøðé.com")], + ), + ( + "http://üñîçøðé.com/ä", + vec![TokenType::Url("http://üñîçøðé.com/ä")], + ), + ( + "http://example.org/¡", + vec![TokenType::Url("http://example.org/¡")], + ), + ( + "http://example.org/¢", + 
vec![TokenType::Url("http://example.org/¢")], + ), + ( + "http://example.org/😀", + vec![TokenType::Url("http://example.org/😀")], + ), + ( + "http://example.org/¢/", + vec![TokenType::Url("http://example.org/¢/")], + ), + ( + "http://xn--c1h.example.com/", + vec![TokenType::Url("http://xn--c1h.example.com/")], + ), + ("üñîçøðé.com", vec![TokenType::UrlNoScheme("üñîçøðé.com")]), + ( + "üñîçøðé.com/ä", + vec![TokenType::UrlNoScheme("üñîçøðé.com/ä")], + ), + ( + "example.org/¡", + vec![TokenType::UrlNoScheme("example.org/¡")], + ), + ( + "example.org/¢", + vec![TokenType::UrlNoScheme("example.org/¢")], + ), + ( + "example.org/😀", + vec![TokenType::UrlNoScheme("example.org/😀")], + ), + ( + "example.org/¢/", + vec![TokenType::UrlNoScheme("example.org/¢/")], + ), + ( + "xn--c1h.example.com/", + vec![TokenType::UrlNoScheme("xn--c1h.example.com/")], + ), + ( + "example.", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + ], + ), + ( + "example./", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Punctuation('/'), + ], + ), + ( + "foo.com.", + vec![ + TokenType::UrlNoScheme("foo.com"), + TokenType::Punctuation('.'), + ], + ), + ( + "example.c", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("example.co", vec![TokenType::UrlNoScheme("example.co")]), + ("example.com", vec![TokenType::UrlNoScheme("example.com")]), + ("e.com", vec![TokenType::UrlNoScheme("e.com")]), + ( + "exampl.e.c", + vec![ + TokenType::Alphabetic("exampl"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("e"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("exampl.e.co", vec![TokenType::UrlNoScheme("exampl.e.co")]), + ( + "e.xample.c", + vec![ + TokenType::Hexadecimal("e"), + TokenType::Punctuation('.'), + TokenType::Alphabetic("xample"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("e.xample.co", vec![TokenType::UrlNoScheme("e.xample.co")]), + ( + "v1.1.1", + vec![ + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + ], + ), + ( + "foo.bar@example.org", + vec![TokenType::Email("foo.bar@example.org")], + ), + ( + "example.com@example.com", + vec![TokenType::Email("example.com@example.com")], + ), + ( + "Look, no scheme: example.org/foo email@foo.com", + vec![ + TokenType::Alphabetic("Look"), + TokenType::Punctuation(','), + TokenType::Space, + TokenType::Alphabetic("no"), + TokenType::Space, + TokenType::Alphabetic("scheme"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("example.org/foo"), + TokenType::Space, + TokenType::Email("email@foo.com"), + ], + ), + ( + "Web:\nwww.foobar.co\nE-Mail:\n bar@foobar.co (bla bla bla)", + vec![ + TokenType::Alphabetic("Web"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("www.foobar.co"), + TokenType::Space, + TokenType::Hexadecimal("E"), + TokenType::Punctuation('-'), + TokenType::Alphabetic("Mail"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Email("bar@foobar.co"), + TokenType::Space, + TokenType::Punctuation('('), + TokenType::Alphabetic("bla"), + TokenType::Space, + TokenType::Alphabetic("bla"), + TokenType::Space, + TokenType::Alphabetic("bla"), + TokenType::Punctuation(')'), + ], + ), + ( + "upi://pay?pa=XXXXXXX&pn=XXXXX", + vec![TokenType::UrlNoHost("upi://pay?pa=XXXXXXX&pn=XXXXX")], + ), + ( + 
"https://example.org?pa=XXXXXXX&pn=XXXXX", + vec![TokenType::Url("https://example.org?pa=XXXXXXX&pn=XXXXX")], + ), + ( + "website https://domain.com", + vec![ + TokenType::Alphabetic("website"), + TokenType::Space, + TokenType::Url("https://domain.com"), + ], + ), + ("a12.b-c.com", vec![TokenType::UrlNoScheme("a12.b-c.com")]), + ( + "v1.2.3", + vec![ + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("2"), + TokenType::Punctuation('.'), + TokenType::Integer("3"), + ], + ), + ( + "https://12-7.0.0.1/", + vec![TokenType::UrlNoHost("https://12-7.0.0.1/")], + ), + ( + "https://user:pass@example.com/", + vec![TokenType::Url("https://user:pass@example.com/")], + ), + ( + "https://user:-.!$@example.com/", + vec![TokenType::Url("https://user:-.!$@example.com/")], + ), + ( + "https://user:!$&'()*+,;=@example.com/", + vec![TokenType::Url("https://user:!$&'()*+,;=@example.com/")], + ), + ( + "https://user:pass@ex@mple.com/", + vec![ + TokenType::UrlNoHost("https://user:pass@ex"), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("mple.com/"), + ], + ), + ( + "https://localhost:8080!", + vec![ + TokenType::UrlNoHost("https://localhost:8080"), + TokenType::Punctuation('!'), + ], + ), + ( + "https://localhost:8080/", + vec![TokenType::UrlNoHost("https://localhost:8080/")], + ), + ( + "https://user:pass@example.com:8080/hi", + vec![TokenType::Url("https://user:pass@example.com:8080/hi")], + ), + ( + "https://127.0.0.1/", + vec![TokenType::Url("https://127.0.0.1/")], + ), + ("1.0.0.0", vec![TokenType::UrlNoScheme("1.0.0.0")]), + ( + "1.0.0.0/foo/bar", + vec![TokenType::UrlNoScheme("1.0.0.0/foo/bar")], + ), + ("1.0 ", vec![TokenType::Float("1.0"), TokenType::Space]), + ( + "1.0.0", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + ], + ), + ( + "1.0.0.0.0", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::UrlNoScheme("0.0.0.0"), + ], + ), + ( + "1.0.0.", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + ], + ), + ( + "https://example.com.:8080/test", + vec![TokenType::Url("https://example.com.:8080/test")], + ), + ( + "https://example.org'", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + ], + ), + ( + "https://example.org'a@example.com", + vec![TokenType::Url("https://example.org'a@example.com")], + ), + ( + "https://a.com'https://b.com", + vec![ + TokenType::UrlNoHost("https://a.com'https"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::UrlNoScheme("b.com"), + ], + ), + ( + "https://example.com...", + vec![ + TokenType::Url("https://example.com"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "www.example..com", + vec![ + TokenType::Alphabetic("www"), + TokenType::Punctuation('.'), + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ( + "https://.www.example.com", + vec![TokenType::Url("https://.www.example.com")], + ), + ( + "-a.com", + vec![TokenType::Punctuation('-'), TokenType::UrlNoScheme("a.com")], + ), + ("https://a.-b.com", vec![TokenType::Url("https://a.-b.com")]), + ( + "a-.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('-'), + 
TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ( + "a.b-.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('-'), + TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ("https://a.b-.com", vec![TokenType::Url("https://a.b-.com")]), + ( + "https://example.com-/", + vec![ + TokenType::Url("https://example.com"), + TokenType::Punctuation('-'), + TokenType::Punctuation('/'), + ], + ), + ( + "https://example.org-", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('-'), + ], + ), + ( + "example.com@about", + vec![ + TokenType::UrlNoScheme("example.com"), + TokenType::Punctuation('@'), + TokenType::Alphabetic("about"), + ], + ), + ( + "example.com/@about", + vec![TokenType::UrlNoScheme("example.com/@about")], + ), + ( + "https://example.com/@about", + vec![TokenType::Url("https://example.com/@about")], + ), + ( + "info@v1.1.1", + vec![ + TokenType::Alphabetic("info"), + TokenType::Punctuation('@'), + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + ], + ), + ("file:///", vec![TokenType::UrlNoHost("file:///")]), + ( + "file:///home/foo", + vec![TokenType::UrlNoHost("file:///home/foo")], + ), + ( + "file://localhost/home/foo", + vec![TokenType::UrlNoHost("file://localhost/home/foo")], + ), + ( + "facetime://+19995551234", + vec![TokenType::UrlNoHost("facetime://+19995551234")], + ), + ( + "test://123'456!!!", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + ], + ), + ( + "test://123'456...", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "test://123'456!!!/", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('/'), + ], + ), + ( + "test://123'456.../", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('/'), + ], + ), + ( + "1abc://example.com", + vec![TokenType::Url("1abc://example.com")], + ), + ( + "¡¢example.com", + vec![TokenType::UrlNoScheme("¡¢example.com")], + ), + ("foo", vec![TokenType::Alphabetic("foo")]), + ("@", vec![TokenType::Punctuation('@')]), + ( + "a@", + vec![TokenType::Hexadecimal("a"), TokenType::Punctuation('@')], + ), + ( + "@a", + vec![TokenType::Punctuation('@'), TokenType::Hexadecimal("a")], + ), + ( + "@@@", + vec![ + TokenType::Punctuation('@'), + TokenType::Punctuation('@'), + TokenType::Punctuation('@'), + ], + ), + ("foo@example.com", vec![TokenType::Email("foo@example.com")]), + ( + "foo.bar@example.com", + vec![TokenType::Email("foo.bar@example.com")], + ), + ( + "#!$%&'*+-/=?^_`{}|~@example.org", + vec![TokenType::Email("#!$%&'*+-/=?^_`{}|~@example.org")], + ), + ( + "foo a@b.com", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::Email("a@b.com"), + ], + ), + ( + "a@b.com foo", + vec![ + TokenType::Email("a@b.com"), + TokenType::Space, + TokenType::Alphabetic("foo"), + ], + ), + ( + "\na@b.com", + vec![TokenType::Space, TokenType::Email("a@b.com")], + ), + ( + "a@b.com\n", + vec![TokenType::Email("a@b.com"), TokenType::Space], + ), + ( + "(a@example.com)", + 
vec![ + TokenType::Punctuation('('), + TokenType::Email("a@example.com"), + TokenType::Punctuation(')'), + ], + ), + ( + "\"a@example.com\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Email("a@example.com"), + TokenType::Punctuation('"'), + ], + ), + ( + "\"a@example.com\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Email("a@example.com"), + TokenType::Punctuation('"'), + ], + ), + ( + ",a@example.com,", + vec![ + TokenType::Punctuation(','), + TokenType::Email("a@example.com"), + TokenType::Punctuation(','), + ], + ), + ( + ":a@example.com:", + vec![ + TokenType::Punctuation(':'), + TokenType::Email("a@example.com"), + TokenType::Punctuation(':'), + ], + ), + ( + ";a@example.com;", + vec![ + TokenType::Punctuation(';'), + TokenType::Email("a@example.com"), + TokenType::Punctuation(';'), + ], + ), + ( + ".@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("example.com"), + ], + ), + ( + "foo.@example.com", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Punctuation('.'), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("example.com"), + ], + ), + ( + ".foo@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Email("foo@example.com"), + ], + ), + ( + ".foo@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Email("foo@example.com"), + ], + ), + ( + "a..b@example.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Email("b@example.com"), + ], + ), + ( + "a@example.com.", + vec![ + TokenType::Email("a@example.com"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + ], + ), + ( + "a@b.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b.com.", + vec![TokenType::Email("a@b.com"), TokenType::Punctuation('.')], + ), + ( + "a@example.com-", + vec![ + TokenType::Email("a@example.com"), + TokenType::Punctuation('-'), + ], + ), + ("a@foo-bar.com", vec![TokenType::Email("a@foo-bar.com")]), + ( + "a@-foo.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Punctuation('-'), + TokenType::UrlNoScheme("foo.com"), + ], + ), + ( + "a@b-.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('-'), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + ], + ), + ( + "a@b.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@example.com b@example.com", + vec![ + TokenType::Email("a@example.com"), + TokenType::Space, + TokenType::Email("b@example.com"), + ], + ), + ( + "a@example.com @ b@example.com", + vec![ + TokenType::Email("a@example.com"), + TokenType::Space, + TokenType::Punctuation('@'), + TokenType::Space, + TokenType::Email("b@example.com"), + ], + ), + ( + "a@xy.com;b@xy.com,c@xy.com", + vec![ + TokenType::Email("a@xy.com"), + TokenType::Punctuation(';'), + TokenType::Email("b@xy.com"), + TokenType::Punctuation(','), + TokenType::Email("c@xy.com"), + ], + ), + ( + "üñîçøðé@example.com", + vec![TokenType::Email("üñîçøðé@example.com")], + ), + ( + "üñîçøðé@üñîçøðé.com", + 
vec![TokenType::Email("üñîçøðé@üñîçøðé.com")],
+ ),
+ ("www@example.com", vec![TokenType::Email("www@example.com")]),
+ (
+ "a@a.xyϸ",
+ vec![
+ TokenType::Hexadecimal("a"),
+ TokenType::Punctuation('@'),
+ TokenType::Hexadecimal("a"),
+ TokenType::Punctuation('.'),
+ TokenType::Alphabetic("xyϸ"),
+ ],
+ ),
+ (
+ "100 -100 100.00 -100.00 $100 $100.00",
+ vec![
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Integer("-100"),
+ TokenType::Space,
+ TokenType::Float("100.00"),
+ TokenType::Space,
+ TokenType::Float("-100.00"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Float("100.00"),
+ ],
+ ),
+ (
+ " - 100 100 . 00",
+ vec![
+ TokenType::Space,
+ TokenType::Punctuation('-'),
+ TokenType::Space,
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Punctuation('.'),
+ TokenType::Space,
+ TokenType::Integer("00"),
+ ],
+ ),
+ (
+ "send $100.00 to user@domain.com or visit domain.com/pay-me!",
+ vec![
+ TokenType::Alphabetic("send"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Float("100.00"),
+ TokenType::Space,
+ TokenType::Alphabetic("to"),
+ TokenType::Space,
+ TokenType::Email("user@domain.com"),
+ TokenType::Space,
+ TokenType::Alphabetic("or"),
+ TokenType::Space,
+ TokenType::Alphabetic("visit"),
+ TokenType::Space,
+ TokenType::UrlNoScheme("domain.com/pay-me"),
+ TokenType::Punctuation('!'),
+ ],
+ ),
+ ] {
+ let result = TypesTokenizer::new(text, &suffixes)
+ .map(|t| t.word)
+ .collect::<Vec<_>>();
+
+ assert_eq!(result, expected);
+
+ /*print!("({text:?}, ");
+ print!("vec![");
+ for (pos, item) in result.into_iter().enumerate() {
+ if pos > 0 {
+ print!(", ");
+ }
+ print!("TokenType::{:?}", item);
+ }
+ println!("]),");*/
+ }
+ }
+}
diff --git a/crates/store/src/fts/tokenizers/indo_european.rs b/crates/nlp/src/tokenizers/word.rs
similarity index 94%
rename from crates/store/src/fts/tokenizers/indo_european.rs
rename to crates/nlp/src/tokenizers/word.rs
index e1f34ce6..26854fbf 100644
--- a/crates/store/src/fts/tokenizers/indo_european.rs
+++ b/crates/nlp/src/tokenizers/word.rs
@@ -21,19 +21,19 @@
  * for more details.
  */
 
-use std::str::CharIndices;
+use std::{borrow::Cow, str::CharIndices};
 
 use super::Token;
 
-pub struct IndoEuropeanTokenizer<'x> {
+pub struct WordTokenizer<'x> {
 max_token_length: usize,
 text: &'x str,
 iterator: CharIndices<'x>,
 }
 
-impl<'x> IndoEuropeanTokenizer<'x> {
- pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
- IndoEuropeanTokenizer {
+impl<'x> WordTokenizer<'x> {
+ pub fn new(text: &str, max_token_length: usize) -> WordTokenizer {
+ WordTokenizer {
 max_token_length,
 text,
 iterator: text.char_indices(),
@@ -42,8 +42,8 @@ impl<'x> WordTokenizer<'x> {
 }
 }
 
 /// Parses indo-european text into lowercase tokens.
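+// (with the rename, the iterator below also moves from Token<'x> to
+// Token<Cow<'x, str>>, so word tokens may be borrowed or owned)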
-impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
- type Item = Token<'x>;
+impl<'x> Iterator for WordTokenizer<'x> {
+ type Item = Token<Cow<'x, str>>;
 
 fn next(&mut self) -> Option<Self::Item> {
 while let Some((token_start, ch)) = self.iterator.next() {
@@ -159,7 +159,7 @@ mod tests {
 ];
 
 for (input, tokens) in inputs.iter() {
- for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
+ for (pos, token) in WordTokenizer::new(input, 40).enumerate() {
 assert_eq!(token, tokens[pos]);
 }
 }
diff --git a/crates/store/src/fts/ngram.rs b/crates/nlp/src/transformers/mod.rs
similarity index 53%
rename from crates/store/src/fts/ngram.rs
rename to crates/nlp/src/transformers/mod.rs
index 2ca2c781..1d2d365b 100644
--- a/crates/store/src/fts/ngram.rs
+++ b/crates/nlp/src/transformers/mod.rs
@@ -21,41 +21,4 @@
  * for more details.
  */
 
-use std::borrow::Cow;
-
-use super::bloom::{BloomFilter, BloomHashGroup};
-
-pub trait ToNgrams: Sized {
- fn new(items: usize) -> Self;
- fn insert(&mut self, item: &str);
- fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
- let mut filter = Self::new(tokens.len().saturating_sub(1));
- for words in tokens.windows(n) {
- filter.insert(&words.join(" "));
- }
- filter
- }
-}
-
-impl ToNgrams for BloomFilter {
- fn new(items: usize) -> Self {
- BloomFilter::new(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.insert(&item.into())
- }
-}
-
-impl ToNgrams for Vec<BloomHashGroup> {
- fn new(items: usize) -> Self {
- Vec::with_capacity(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.push(BloomHashGroup {
- h1: item.into(),
- h2: None,
- })
- }
-}
+pub mod osb;
diff --git a/crates/nlp/src/transformers/osb.rs b/crates/nlp/src/transformers/osb.rs
new file mode 100644
index 00000000..0c87132d
--- /dev/null
+++ b/crates/nlp/src/transformers/osb.rs
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
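+
+// Orthogonal Sparse Bigram (OSB) transformer: every token is paired with up
+// to `window_size - 1` of the tokens that follow it, and `idx` records the
+// distance between the pair, producing the sparse bigram features typically
+// fed to a Bayes classifier.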
+*/
+
+use std::iter::Peekable;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct OsbToken<T> {
+    pub inner: T,
+    pub idx: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Gram<'x> {
+    Uni { t1: &'x str },
+    Bi { t1: &'x str, t2: &'x str },
+}
+
+pub struct OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    iter: Peekable<I>,
+    buf: Vec<Option<&'x str>>,
+    window_size: usize,
+    window_pos: usize,
+    window_idx: usize,
+}
+
+impl<'x, I> OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    pub fn new(iter: I, window_size: usize) -> Self {
+        Self {
+            iter: iter.peekable(),
+            buf: vec![None; window_size],
+            window_pos: 0,
+            window_idx: 0,
+            window_size,
+        }
+    }
+}
+
+impl<'x, I> Iterator for OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    type Item = OsbToken<Gram<'x>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let end_pos = (self.window_pos + self.window_idx) % self.window_size;
+        if self.buf[end_pos].is_none() {
+            self.buf[end_pos] = self.iter.next();
+        }
+
+        let t1 = self.buf[self.window_pos % self.window_size]?;
+        let token = OsbToken {
+            inner: if self.window_idx != 0 {
+                Gram::Bi {
+                    t1,
+                    t2: self.buf[end_pos]?,
+                }
+            } else {
+                Gram::Uni { t1 }
+            },
+            idx: self.window_idx,
+        };
+
+        // Increment window
+        self.window_idx += 1;
+        if self.window_idx == self.window_size
+            || (self.iter.peek().is_none()
+                && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
+        {
+            self.buf[self.window_pos % self.window_size] = None;
+            self.window_idx = 0;
+            self.window_pos += 1;
+        }
+
+        Some(token)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::transformers::osb::{Gram, OsbToken};
+
+    #[test]
+    fn osb_tokenizer() {
+        assert_eq!(
+            super::OsbTokenizer::new(
+                "The quick brown fox jumps over the lazy dog and the lazy cat"
+                    .split_ascii_whitespace(),
+                5
+            )
+            .collect::<Vec<_>>(),
+            vec![
+                OsbToken {
+                    inner: Gram::Uni { t1: "The" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "quick"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "brown"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "fox"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "jumps"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "quick" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "brown"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "fox"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "jumps"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "over"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "brown" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "fox"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "jumps"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "over"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "the"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "fox" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "jumps"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "over"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "the"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "lazy"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "jumps" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "jumps",
+                        t2: "over"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "jumps",
+                        t2: "the"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi
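//                The tokenizer under test pairs each word with the following
//                window_size - 1 words, tagging each pair with its distance idx:
//                an OSB (Orthogonal Sparse Bigram) transform, the classic feature
//                set for Bayes-style spam classifiers, where idx lets the
//                classifier tell near pairs from far ones. Minimal sketch using
//                the types defined above (illustrative, not part of this patch):
//
//                    let grams: Vec<_> = OsbTokenizer::new(
//                        "one two three".split_ascii_whitespace(),
//                        3,
//                    )
//                    .collect();
//                    // yields: Uni("one") idx 0, Bi("one","two") idx 1,
//                    //         Bi("one","three") idx 2, Uni("two") idx 0,
//                    //         Bi("two","three") idx 1, Uni("three") idx 0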
{ + t1: "jumps", + t2: "lazy" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "jumps", + t2: "dog" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "over" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "the" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "lazy" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "dog" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "and" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "the" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "lazy" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "dog" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "and" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "the" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "lazy" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "dog" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "and" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "the" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "lazy" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "dog" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "and" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "the" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "lazy" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "cat" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "and" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "the" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "lazy" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "cat" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Uni { t1: "the" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "lazy" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "cat" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Uni { t1: "lazy" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "cat" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Uni { t1: "cat" }, + idx: 0 + } + ] + ); + } +} diff --git a/crates/store/Cargo.toml b/crates/store/Cargo.toml index 9c4bb149..5a2dc3f5 100644 --- a/crates/store/Cargo.toml +++ b/crates/store/Cargo.toml @@ -6,6 +6,7 @@ resolver = "2" [dependencies] utils = { path = "../utils" } +nlp = { path = "../nlp" } maybe-async = { path = "../maybe-async" } rocksdb = { version = "0.20.1", optional = true } foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true } @@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]} ahash = { version = "0.8.0", features = ["serde"] } bitpacking = "0.8.4" lazy_static = "1.4" -whatlang = "0.16" # Language detection -rust-stemmers = "1.2" # Stemmers -tinysegmenter = "0.1" # Japanese tokenizer -jieba-rs = "0.6" # Chinese stemmer xxhash-rust = { version = "0.8.5", features = ["xxh3"] } farmhash = "1.1.5" -siphasher = "0.3" +siphasher = "1.0" parking_lot = "0.12.1" lru-cache = { version = "0.1.2", optional = true } num_cpus = { version = "1.15.0", optional = true } diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs index 54905458..31e36427 100644 --- a/crates/store/src/fts/bloom.rs +++ b/crates/store/src/fts/bloom.rs @@ -27,13 +27,12 @@ use std::{ 
     hash::{Hash, Hasher},
 };
 
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
 use roaring::RoaringBitmap;
 use utils::codec::leb128::{Leb128Reader, Leb128Vec};
 
 use crate::{Deserialize, Error, Serialize};
 
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
 pub struct BloomFilter {
     m: u64,
     b: RoaringBitmap,
@@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
     }
 }
 
-impl From<Token<'_>> for BloomHashGroup {
-    fn from(t: Token<'_>) -> Self {
+impl From<Token<Cow<'_, str>>> for BloomHashGroup {
+    fn from(t: Token<Cow<'_, str>>) -> Self {
         Self {
             h1: BloomHash::hash(t.word.as_ref()),
             h2: None,
diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs
index 3ddf538f..508d1e87 100644
--- a/crates/store/src/fts/builder.rs
+++ b/crates/store/src/fts/builder.rs
@@ -24,6 +24,14 @@ use std::{borrow::Cow, collections::HashSet};
 
 use ahash::AHashSet;
+use nlp::{
+    language::{
+        detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
+        stemmer::Stemmer,
+        Language,
+    },
+    tokenizers::{space::SpaceTokenizer, Token},
+};
 use utils::map::vec_map::VecMap;
 
 use crate::{
@@ -32,13 +40,7 @@ use crate::{
     Serialize, HASH_EXACT, HASH_STEMMED,
 };
 
-use super::{
-    lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
-    stemmer::Stemmer,
-    term_index::{TermIndexBuilder, TokenIndex},
-    tokenizers::{space::SpaceTokenizer, Token},
-    Language,
-};
+use super::term_index::{TermIndexBuilder, TokenIndex};
 
 pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
 pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
@@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
                 ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
                 terms.push(term_index.add_token(Token {
                     word: token.into(),
-                    offset: 0,
-                    len: 0,
+                    from: 0,
+                    to: 0,
                 }));
             }
             term_index.add_terms(field, 0, terms);
diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs
index 3f3d0b9e..8761f076 100644
--- a/crates/store/src/fts/mod.rs
+++ b/crates/store/src/fts/mod.rs
@@ -26,149 +26,13 @@ use crate::{
     BitmapKey, Serialize, BM_HASH,
 };
 
-use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
+use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
 
-pub mod lang;
-//pub mod pdf;
 pub mod bloom;
 pub mod builder;
-pub mod ngram;
 pub mod query;
 pub mod search_snippet;
-pub mod stemmer;
 pub mod term_index;
-pub mod tokenizers;
-
-#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
-pub enum Language {
-    Esperanto = 0,
-    English = 1,
-    Russian = 2,
-    Mandarin = 3,
-    Spanish = 4,
-    Portuguese = 5,
-    Italian = 6,
-    Bengali = 7,
-    French = 8,
-    German = 9,
-    Ukrainian = 10,
-    Georgian = 11,
-    Arabic = 12,
-    Hindi = 13,
-    Japanese = 14,
-    Hebrew = 15,
-    Yiddish = 16,
-    Polish = 17,
-    Amharic = 18,
-    Javanese = 19,
-    Korean = 20,
-    Bokmal = 21,
-    Danish = 22,
-    Swedish = 23,
-    Finnish = 24,
-    Turkish = 25,
-    Dutch = 26,
-    Hungarian = 27,
-    Czech = 28,
-    Greek = 29,
-    Bulgarian = 30,
-    Belarusian = 31,
-    Marathi = 32,
-    Kannada = 33,
-    Romanian = 34,
-    Slovene = 35,
-    Croatian = 36,
-    Serbian = 37,
-    Macedonian = 38,
-    Lithuanian = 39,
-    Latvian = 40,
-    Estonian = 41,
-    Tamil = 42,
-    Vietnamese = 43,
-    Urdu = 44,
-    Thai = 45,
-    Gujarati = 46,
-    Uzbek = 47,
-    Punjabi = 48,
-    Azerbaijani = 49,
-    Indonesian = 50,
-    Telugu = 51,
-    Persian = 52,
-    Malayalam = 53,
-    Oriya = 54,
-    Burmese = 55,
-    Nepali = 56,
-    Sinhalese = 57,
-    Khmer = 58,
-    Turkmen = 59,
-    Akan = 60,
-    Zulu = 61,
-    Shona = 62,
-    Afrikaans = 63,
-    Latin = 64,
-    Slovak = 65,
-    Catalan = 66,
-    Tagalog = 67,
-    Armenian = 68,
-    Unknown = 69,
-    None = 70,
-}
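// The enum removed here is not dropped: it moves into the new nlp crate and is
// imported back as nlp::language::Language by the use statements added in
// builder.rs above and in query.rs and term_index.rs below. Illustrative call
// after the move, assuming the ISO-639 helper travels with the enum:
//
//     use nlp::language::Language;
//
//     let lang = Language::from_iso_639("en").unwrap_or(Language::Unknown);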
-
-impl Language {
-    pub fn from_iso_639(code: &str) -> Option<Self> {
-        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
-            "en" => Language::English,
-            "es" => Language::Spanish,
-            "pt" => Language::Portuguese,
-            "it" => Language::Italian,
-            "fr" => Language::French,
-            "de" => Language::German,
-            "ru" => Language::Russian,
-            "zh" => Language::Mandarin,
-            "ja" => Language::Japanese,
-            "ar" => Language::Arabic,
-            "hi" => Language::Hindi,
-            "ko" => Language::Korean,
-            "bn" => Language::Bengali,
-            "he" => Language::Hebrew,
-            "ur" => Language::Urdu,
-            "fa" => Language::Persian,
-            "ml" => Language::Malayalam,
-            "or" => Language::Oriya,
-            "my" => Language::Burmese,
-            "ne" => Language::Nepali,
-            "si" => Language::Sinhalese,
-            "km" => Language::Khmer,
-            "tk" => Language::Turkmen,
-            "am" => Language::Amharic,
-            "az" => Language::Azerbaijani,
-            "id" => Language::Indonesian,
-            "te" => Language::Telugu,
-            "ta" => Language::Tamil,
-            "vi" => Language::Vietnamese,
-            "gu" => Language::Gujarati,
-            "pa" => Language::Punjabi,
-            "uz" => Language::Uzbek,
-            "hy" => Language::Armenian,
-            "ka" => Language::Georgian,
-            "la" => Language::Latin,
-            "sl" => Language::Slovene,
-            "hr" => Language::Croatian,
-            "sr" => Language::Serbian,
-            "mk" => Language::Macedonian,
-            "lt" => Language::Lithuanian,
-            "lv" => Language::Latvian,
-            "et" => Language::Estonian,
-            "tl" => Language::Tagalog,
-            "af" => Language::Afrikaans,
-            "zu" => Language::Zulu,
-            "sn" => Language::Shona,
-            "ak" => Language::Akan,
-            _ => return None,
-        }
-        .into()
-    }
-}
 
 impl BitmapKey<Vec<u8>> {
     pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
@@ -209,19 +73,3 @@ impl Operation {
         }
     }
 }
-
-impl Language {
-    pub fn detect(text: String, default: Language) -> (String, Language) {
-        if let Some((l, t)) = text
-            .split_once(':')
-            .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
-        {
-            (t.to_string(), l)
-        } else {
-            let l = LanguageDetector::detect_single(&text)
-                .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
-                .unwrap_or(default);
-            (text, l)
-        }
-    }
-}
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 09439d30..77bc4dbd 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -21,14 +21,14 @@
  * for more details.
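// The Language::detect helper removed above keeps a two-step contract: an
// explicit "iso-code:text" prefix wins, otherwise whatlang-based detection is
// trusted only above the 0.3 confidence cutoff. Illustrative call, assuming the
// signature is unchanged after its move to the nlp crate:
//
//     let (text, lang) = Language::detect("en:hello world".to_string(), Language::None);
//     assert_eq!(lang, Language::English);
//     assert_eq!(text, "hello world");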
 */
 
+use nlp::language::{stemmer::Stemmer, Language};
 use roaring::RoaringBitmap;
 
 use crate::{
-    fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
-    BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
+    fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
 };
 
-use super::{term_index::TermIndex, Language};
+use super::term_index::TermIndex;
 
 impl ReadTransaction<'_> {
     #[maybe_async::maybe_async]
@@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
         if match_phrase {
             let mut phrase = Vec::new();
             let mut bit_keys = Vec::new();
-            for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
+            for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
                 let key = BitmapKey::hash(
                     token.word.as_ref(),
                     account_id,
diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs
index 89c557b1..55d6b6b7 100644
--- a/crates/store/src/fts/search_snippet.rs
+++ b/crates/store/src/fts/search_snippet.rs
@@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
 
 #[cfg(test)]
 mod tests {
+    use nlp::language::Language;
+
     use crate::{
-        fts::{
-            term_index::{TermIndex, TermIndexBuilder},
-            tokenizers::Tokenizer,
-            Language,
-        },
+        fts::term_index::{TermIndex, TermIndexBuilder},
         Deserialize, Serialize,
     };
 
@@ -242,7 +240,7 @@ mod tests {
         for (field_num, part) in parts.iter().enumerate() {
             let mut terms = Vec::new();
-            for token in Tokenizer::new(part, Language::English, 40) {
+            for token in Language::English.tokenize_text(part, 40) {
                 terms.push(builder.add_token(token));
             }
             builder.add_terms(field_num as u8, 0, terms);
diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs
index e2653853..b91f74db 100644
--- a/crates/store/src/fts/term_index.rs
+++ b/crates/store/src/fts/term_index.rs
@@ -21,14 +21,13 @@
  * for more details.
  */
 
-use std::convert::TryInto;
+use std::{borrow::Cow, convert::TryInto};
 
 use crate::{Deserialize, Serialize};
 
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
 use ahash::{AHashMap, AHashSet};
 use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
 use utils::codec::leb128::{Leb128Reader, Leb128Vec};
 
 #[derive(Debug)]
@@ -227,7 +226,7 @@ impl TermIndexBuilder {
         }
     }
 
-    pub fn add_token(&mut self, token: Token) -> Term {
+    pub fn add_token(&mut self, token: Token<Cow<'_, str>>) -> Term {
         let id = self.terms.len() as u32;
         let id = self
             .terms
@@ -236,8 +235,8 @@ impl TermIndexBuilder {
         Term {
             id: *id,
            id_stemmed: *id,
-            offset: token.offset,
-            len: token.len,
+            offset: token.from as u32,
+            len: (token.to - token.from) as u8,
         }
     }
@@ -259,8 +258,8 @@ impl TermIndexBuilder {
         Term {
             id,
             id_stemmed,
-            offset: token.offset,
-            len: token.len,
+            offset: token.from as u32,
+            len: (token.to - token.from) as u8,
         }
     }
@@ -775,13 +774,10 @@ impl TokenIndex {
 mod tests {
     use ahash::AHashMap;
 
+    use nlp::language::{stemmer::Stemmer, Language};
+
     use crate::{
-        fts::{
-            stemmer::Stemmer,
-            term_index::{TermIndexBuilder, TokenIndex},
-            Language,
-        },
+        fts::term_index::{TermIndexBuilder, TokenIndex},
         Deserialize, Serialize,
     };
diff --git a/crates/store/src/fts/tokenizers/chinese.rs b/crates/store/src/fts/tokenizers/chinese.rs
deleted file mode 100644
index e741571d..00000000
--- a/crates/store/src/fts/tokenizers/chinese.rs
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
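// The language-specific tokenizers deleted below are superseded by the nlp
// crate, and the old Tokenizer::new(text, language, max_len) dispatch is now
// reached through language.tokenize_text(text, max_len), as the query.rs and
// search_snippet.rs hunks above show. Illustrative sketch of the new entry
// point (token.from/token.to per the builder.rs change above; the dispatch by
// language is assumed to match the deleted code):
//
//     use nlp::language::Language;
//
//     for token in Language::English.tokenize_text("hello world", 40) {
//         println!("{} @ {}..{}", token.word, token.from, token.to);
//     }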
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::{borrow::Cow, vec::IntoIter}; - -use jieba_rs::Jieba; - -use super::{word::WordTokenizer, Token}; -use lazy_static::lazy_static; - -lazy_static! { - static ref JIEBA: Jieba = Jieba::new(); -} - -pub struct ChineseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter<&'x str>, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> ChineseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer { - ChineseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for ChineseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - loop { - if let Some(ch_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += ch_token.len(); - - if ch_token.len() <= self.max_token_length { - return Token::new(offset_start, ch_token.len(), ch_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - let word = match token.word { - Cow::Borrowed(word) => word, - Cow::Owned(_) => unreachable!(), - }; - self.tokens = JIEBA.cut(word, false).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn chinese_tokenizer() { - assert_eq!( - ChineseTokenizer::new( - "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。", - 40 - ) - .collect::>(), - vec![ - Token { - word: "孫".into(), - offset: 0, - len: 3 - }, - Token { - word: "子".into(), - offset: 3, - len: 3 - }, - Token { - word: "曰".into(), - offset: 6, - len: 3 - }, - Token { - word: "兵".into(), - offset: 12, - len: 3 - }, - Token { - word: "者".into(), - offset: 15, - len: 3 - }, - Token { - word: "國".into(), - offset: 21, - len: 3 - }, - Token { - word: "之".into(), - offset: 24, - len: 3 - }, - Token { - word: "大事".into(), - offset: 27, - len: 6 - }, - Token { - word: "死".into(), - offset: 36, - len: 3 - }, - Token { - word: "生".into(), - offset: 39, - len: 3 - }, - Token { - word: "之".into(), - offset: 42, - len: 3 - }, - Token { - word: "地".into(), - offset: 45, - len: 3 - }, - Token { - word: "存亡".into(), - offset: 51, - len: 6 - }, - Token { - word: "之".into(), - offset: 57, - len: 3 - }, - Token { - word: "道".into(), - offset: 60, - len: 3 - }, - Token { - word: "不可不".into(), - offset: 
66, - len: 9 - }, - Token { - word: "察".into(), - offset: 75, - len: 3 - }, - Token { - word: "也".into(), - offset: 78, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/japanese.rs b/crates/store/src/fts/tokenizers/japanese.rs deleted file mode 100644 index 816ba0a3..00000000 --- a/crates/store/src/fts/tokenizers/japanese.rs +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::vec::IntoIter; - -use super::{word::WordTokenizer, Token}; - -pub struct JapaneseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> JapaneseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer { - JapaneseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for JapaneseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - loop { - if let Some(jp_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += jp_token.len(); - - if jp_token.len() <= self.max_token_length { - return Token::new(offset_start, jp_token.len(), jp_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn japanese_tokenizer() { - assert_eq!( - JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40) - .collect::>(), - vec![ - Token { - word: "お先".into(), - offset: 0, - len: 6 - }, - Token { - word: "に".into(), - offset: 6, - len: 3 - }, - Token { - word: "失礼".into(), - offset: 9, - len: 6 - }, - Token { - word: "し".into(), - offset: 15, - len: 3 - }, - Token { - word: "ます".into(), - offset: 18, - len: 6 - }, - Token { - word: "あなた".into(), - offset: 25, - len: 9 - }, - Token { - word: "の".into(), - offset: 34, - len: 3 - }, - Token { - word: "名前".into(), - offset: 37, - len: 6 - }, - Token { - word: "は".into(), - offset: 43, - len: 3 - }, - Token { - word: "何".into(), - offset: 46, - len: 3 - }, - Token { - word: "です".into(), - offset: 49, - len: 6 - }, - Token 
{ - word: "か".into(), - offset: 55, - len: 3 - }, - Token { - word: "123".into(), - offset: 59, - len: 3 - }, - Token { - word: "abc".into(), - offset: 63, - len: 3 - }, - Token { - word: "872".into(), - offset: 67, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/mod.rs b/crates/store/src/fts/tokenizers/mod.rs deleted file mode 100644 index 3679b2b3..00000000 --- a/crates/store/src/fts/tokenizers/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -pub mod chinese; -pub mod indo_european; -pub mod japanese; -pub mod space; -pub mod word; - -use std::borrow::Cow; - -use self::{ - chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer, -}; - -use super::Language; - -#[derive(Debug, PartialEq, Eq)] -pub struct Token<'x> { - pub word: Cow<'x, str>, - pub offset: u32, // Word offset in the text part - pub len: u8, // Word length -} - -impl<'x> Token<'x> { - pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> { - debug_assert!(offset <= u32::max_value() as usize); - debug_assert!(len <= u8::max_value() as usize); - Token { - offset: offset as u32, - len: len as u8, - word, - } - } -} - -enum LanguageTokenizer<'x> { - IndoEuropean(IndoEuropeanTokenizer<'x>), - Japanese(JapaneseTokenizer<'x>), - Chinese(ChineseTokenizer<'x>), -} - -pub struct Tokenizer<'x> { - tokenizer: LanguageTokenizer<'x>, -} - -impl<'x> Tokenizer<'x> { - pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self { - Tokenizer { - tokenizer: match language { - Language::Japanese => { - LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length)) - } - Language::Mandarin => { - LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length)) - } - _ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new( - text, - max_token_length, - )), - }, - } - } -} - -impl<'x> Iterator for Tokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - match &mut self.tokenizer { - LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(), - LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(), - LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(), - } - } -} diff --git a/crates/store/src/fts/tokenizers/word.rs b/crates/store/src/fts/tokenizers/word.rs deleted file mode 100644 index 3e50ba1a..00000000 --- a/crates/store/src/fts/tokenizers/word.rs +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::str::CharIndices; - -use super::Token; - -pub struct WordTokenizer<'x> { - text: &'x str, - iterator: CharIndices<'x>, -} - -impl<'x> WordTokenizer<'x> { - pub fn new(text: &str) -> WordTokenizer { - WordTokenizer { - text, - iterator: text.char_indices(), - } - } -} - -/// Parses text into tokens, used by non-IndoEuropean tokenizers. -impl<'x> Iterator for WordTokenizer<'x> { - type Item = (Token<'x>, bool); - - fn next(&mut self) -> Option { - let mut is_ascii = true; - while let Some((token_start, ch)) = self.iterator.next() { - if ch.is_alphanumeric() { - let token_end = (&mut self.iterator) - .filter_map(|(pos, ch)| { - if ch.is_alphanumeric() { - if is_ascii && !ch.is_ascii() { - is_ascii = false; - } - None - } else { - pos.into() - } - }) - .next() - .unwrap_or(self.text.len()); - - let token_len = token_end - token_start; - if token_end > token_start { - return ( - Token::new( - token_start, - token_len, - self.text[token_start..token_end].into(), - ), - is_ascii, - ) - .into(); - } - } - } - None - } -} diff --git a/crates/store/src/query/filter.rs b/crates/store/src/query/filter.rs index 5b74a9ae..9e4b7109 100644 --- a/crates/store/src/query/filter.rs +++ b/crates/store/src/query/filter.rs @@ -24,12 +24,10 @@ use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; use ahash::HashSet; +use nlp::tokenizers::space::SpaceTokenizer; use roaring::RoaringBitmap; -use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - BitmapKey, ReadTransaction, Store, -}; +use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store}; use super::{Filter, ResultSet, TextMatch}; diff --git a/crates/store/src/query/mod.rs b/crates/store/src/query/mod.rs index 86f7eec9..05442caf 100644 --- a/crates/store/src/query/mod.rs +++ b/crates/store/src/query/mod.rs @@ -26,11 +26,10 @@ pub mod get; pub mod log; pub mod sort; +use nlp::language::Language; use roaring::RoaringBitmap; -use crate::{ - fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS, -}; +use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Operator { diff --git a/crates/store/src/write/mod.rs b/crates/store/src/write/mod.rs index 48d8027a..44826133 100644 --- a/crates/store/src/write/mod.rs +++ b/crates/store/src/write/mod.rs @@ -23,11 +23,11 @@ use std::{collections::HashSet, slice::Iter, time::SystemTime}; +use nlp::tokenizers::space::SpaceTokenizer; use utils::codec::leb128::{Leb128Iterator, Leb128Vec}; use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - 
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, + fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, }; use self::assert::AssertValue; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index cce3410f..920cd28c 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -12,6 +12,7 @@ foundationdb = ["store/foundation"] [dependencies] store = { path = "../crates/store", features = ["test_mode"] } +nlp = { path = "../crates/nlp" } directory = { path = "../crates/directory" } jmap = { path = "../crates/jmap", features = ["test_mode"] } jmap_proto = { path = "../crates/jmap-proto" } diff --git a/tests/src/store/query.rs b/tests/src/store/query.rs index cd154593..7e1ab6c9 100644 --- a/tests/src/store/query.rs +++ b/tests/src/store/query.rs @@ -27,10 +27,11 @@ use std::{ }; use jmap_proto::types::keyword::Keyword; +use nlp::language::Language; use store::{ahash::AHashMap, query::sort::Pagination}; use store::{ - fts::{builder::FtsIndexBuilder, Language}, + fts::builder::FtsIndexBuilder, query::{Comparator, Filter}, write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE}, Store, ValueKey,