Bayes classifier, type tokenizer and NLP module reorganization

mdecimus 2023-10-10 18:58:38 +02:00
parent a0812095ef
commit 3d9efd363a
53 changed files with 4651 additions and 944 deletions

View file

@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. This projec
## [0.3.9] - 2023-10-07
### Added
- Support for reading environment variables from configuration file using the `!ENV_VAR_NAME` special keyword.
- Support for reading environment variables from the configuration file using the `!ENV_VAR_NAME` special keyword.
- Option to disable ANSI color codes in logs.
### Changed
- Querying directories from a Sieve script is now done using the `query()` method from `eval`. Your scripts will need to be updated, please refer to the [new syntax](https://stalw.art/docs/smtp/filter/sieve#directory-queries).

116
Cargo.lock generated
View file

@ -169,13 +169,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "antispam"
version = "0.1.0"
dependencies = [
"fancy-regex",
]
[[package]]
name = "anyhow"
version = "1.0.75"
@ -1487,25 +1480,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.4"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480"
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "etcetera"
version = "0.8.0"
@ -2252,6 +2234,7 @@ dependencies = [
"mail-parser",
"mail-send",
"md5",
"nlp",
"parking_lot",
"rustls 0.21.7",
"rustls-pemfile",
@ -2450,6 +2433,7 @@ dependencies = [
"mail-parser",
"mail-send",
"mime",
"nlp",
"p256",
"rand 0.8.5",
"rasn",
@ -2510,9 +2494,9 @@ dependencies = [
[[package]]
name = "jobserver"
version = "0.1.26"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d"
dependencies = [
"libc",
]
@ -2703,9 +2687,9 @@ dependencies = [
[[package]]
name = "linux-raw-sys"
version = "0.4.8"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db"
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "lock_api"
@ -2754,7 +2738,7 @@ dependencies = [
"mail-parser",
"parking_lot",
"quick-xml 0.30.0",
"ring 0.17.2",
"ring 0.17.3",
"rustls-pemfile",
"serde",
"serde_json",
@ -3001,6 +2985,30 @@ dependencies = [
"pin-utils",
]
[[package]]
name = "nlp"
version = "0.3.9"
dependencies = [
"ahash 0.8.3",
"bincode",
"farmhash",
"jieba-rs",
"lazy_static",
"nohash",
"rust-stemmers",
"serde",
"siphasher 1.0.0",
"tinysegmenter",
"whatlang",
"xxhash-rust",
]
[[package]]
name = "nohash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca"
[[package]]
name = "nom"
version = "7.1.3"
@ -3072,9 +3080,9 @@ dependencies = [
[[package]]
name = "num-traits"
version = "0.2.16"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
"libm",
@ -3476,7 +3484,7 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@ -3485,7 +3493,7 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@ -3791,9 +3799,9 @@ dependencies = [
[[package]]
name = "rasn"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cf5174961dbfd4f03b57e71e5a11b034f564d5f0b133d63e39d703ac3d2876b"
checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2"
dependencies = [
"arrayvec",
"bitvec",
@ -3813,9 +3821,9 @@ dependencies = [
[[package]]
name = "rasn-cms"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56517898cf38bb50fdb6479049ed476510bf59ae7d329b35129dc8a8b309697f"
checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5"
dependencies = [
"rasn",
"rasn-pkix",
@ -3823,9 +3831,9 @@ dependencies = [
[[package]]
name = "rasn-derive"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8def4ce07f970be91bad36c3090af419dcd9e696897ada3cf74bd480e0101d61"
checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618"
dependencies = [
"either",
"itertools 0.10.5",
@ -3838,9 +3846,9 @@ dependencies = [
[[package]]
name = "rasn-pkix"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebdeef45b70d4c20ce34725707b2784c761eacaaa4d841eab46f9f9c6dc10dd3"
checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8"
dependencies = [
"rasn",
]
@ -4024,9 +4032,9 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.2"
version = "0.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "911b295d2d302948838c8ac142da1ee09fa7863163b44e6715bc9357905878b8"
checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e"
dependencies = [
"cc",
"getrandom 0.2.10",
@ -4198,9 +4206,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.38.17"
version = "0.38.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7"
checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c"
dependencies = [
"bitflags 2.4.0",
"errno",
@ -4644,6 +4652,12 @@ version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "siphasher"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe"
[[package]]
name = "slab"
version = "0.4.9"
@ -5048,10 +5062,10 @@ dependencies = [
"farmhash",
"foundationdb",
"futures",
"jieba-rs",
"lazy_static",
"lru-cache",
"maybe-async 0.2.7",
"nlp",
"num_cpus",
"parking_lot",
"r2d2",
@ -5061,14 +5075,11 @@ dependencies = [
"rocksdb",
"rusqlite",
"rust-s3",
"rust-stemmers",
"serde",
"siphasher",
"tinysegmenter",
"siphasher 1.0.0",
"tokio",
"tracing",
"utils",
"whatlang",
"xxhash-rust",
]
@ -5244,6 +5255,7 @@ dependencies = [
"mail-parser",
"mail-send",
"managesieve",
"nlp",
"num_cpus",
"rayon",
"reqwest",
@ -5358,9 +5370,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.32.0"
version = "1.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9"
checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653"
dependencies = [
"backtrace",
"bytes",
@ -6040,12 +6052,12 @@ dependencies = [
[[package]]
name = "webpki"
version = "0.22.2"
version = "0.22.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
dependencies = [
"ring 0.16.20",
"untrusted 0.7.1",
"ring 0.17.3",
"untrusted 0.9.0",
]
[[package]]

View file

@ -8,9 +8,9 @@ members = [
"crates/imap-proto",
"crates/smtp",
"crates/managesieve",
"crates/nlp",
"crates/store",
"crates/directory",
"crates/antispam",
"crates/utils",
"crates/maybe-async",
"crates/cli",

View file

@ -38,6 +38,7 @@ Key features:
- OAuth 2.0 [authorization code](https://www.rfc-editor.org/rfc/rfc6749) and [device authorization](https://www.rfc-editor.org/rfc/rfc8628) flows.
- Access Control Lists (ACLs).
- Rate limiting.
- Security audited (read the [report](https://stalw.art/blog/security-audit)).
- **Robust and scalable**:
- **FoundationDB** or **SQLite** database backends.
- **S3-compatible** blob storage support.

View file

@ -1,7 +0,0 @@
[package]
name = "antispam"
version = "0.1.0"
edition = "2021"
[dependencies]
fancy-regex = "0.11.0"

View file

@ -1,64 +0,0 @@
use std::path::PathBuf;
use import::spamassassin::import_spamassassin;
pub mod import;
fn main() {
import_spamassassin(
PathBuf::from("/Users/me/code/mail-server/resources/spamassassin"),
"cf".to_string(),
false,
);
}
const _IGNORE: &str = r#"
[antispam]
required-score = 5
add-headers = ["X-Spam-Checker-Version: SpamAssassin _VERSION_ (_SUBVERSION_) on _HOSTNAME_",
"X-Spam-Flag: _YESNOCAPS_", "X-Spam-Level: _STARS(*)_",
"X-Spam-Status: _YESNO_, score=_SCORE_ required=_REQD_ tests=_TESTS_ autolearn=_AUTOLEARN_ version=_VERSION_"]
originating-ip-headers = ["X-Yahoo-Post-IP", "X-Originating-IP", "X-Apparently-From",
"X-SenderIP X-AOL-IP", "X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp"]
rewrite-headers = ["Subject: [SPAM] _SUBJECT_"]
redirect-patterns = ["""m'/(?:index.php)?\?.*(?<=[?&])URL=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/url\?.*?(?<=[?&])q=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:site|inurl):(.*?)(?:$|%20|[\s+&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:"|%22)(.*?)(?:$|%22|["\s+&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/translate\?.*?(?<=[?&])u=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/pagead/iclk\?.*?(?<=[?&])adurl=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?aol\.com/redir\.adp\?.*(?<=[?&])_url=(.*?)(?:$|[&\#])'i""",
"""m'^https?/*(?:\w+\.)?facebook\.com/l/;(.*)'i""",
"""/^http:\/\/chkpt\.zdnet\.com\/chkpt\/\w+\/(.*)$/i""",
"""/^http:\/\/www(?:\d+)?\.nate\.com\/r\/\w+\/(.*)$/i""",
"""/^http:\/\/.+\.gov\/(?:.*\/)?externalLink\.jhtml\?.*url=(.*?)(?:&.*)?$/i""",
"""/^http:\/\/redir\.internet\.com\/.+?\/.+?\/(.*)$/i""",
"""/^http:\/\/(?:.*?\.)?adtech\.de\/.*(?:;|\|)link=(.*?)(?:;|$)/i""",
"""m'^http.*?/redirect\.php\?.*(?<=[?&])goto=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:[^/]+\.)?emf\d\.com/r\.cfm.*?&r=(.*)'i"""
]
[antispam.autolearn]
enable = true
ignore-headers = [ "X-ACL-Warn", "X-Alimail-AntiSpam", "X-Amavis-Modified", "X-Anti*", "X-aol-global-disposition",
"X-ASF-*", "X-Assp-Version", "X-Authority-Analysis", "X-Authvirus", "X-Auto-Response-Suppress", "X-AV-Do-Run",
"X-AV-Status", "X-avast-antispam", "X-Backend", "X-Barracuda*", "X-Bayes*", "X-BitDefender*", "X-BL", "X-Bogosity",
"X-Boxtrapper", "X-Brightmail-Tracker", "X-BTI-AntiSpam", "X-Bugzilla-Version", "X-CanIt*", "X-Clapf-spamicity",
"X-Cloud-Security", "X-CM-Score", "X-CMAE-*", "X-Company", "X-Coremail-Antispam", "X-CRM114-*", "X-CT-Spam",
"X-CTCH-*", "X-Drweb-SpamState", "X-DSPAM*", "X-eavas*", "X-Enigmail-Version", "X-Eset*", "X-Exchange-Antispam-Report",
"X-ExtloopSabreCommercials1", "X-EYOU-SPAMVALUE", "X-FB-OUTBOUND-SPAM", "X-FEAS-SBL", "X-FILTER-SCORE", "X-Forefront*",
"X-Fuglu*", "X-getmail-filter-classifier", "X-GFIME-MASPAM", "X-Gmane-NNTP-Posting-Host", "X-GMX-Anti*", "X-He-Spam",
"X-hMailServer-Spam", "X-IAS", "X-iGspam-global", "X-Injected-Via-Gmane", "X-Interia-Antivirus", "X-IP-Spam-Verdict",
"X-Ironport*", "X-Junk*", "X-KLMS-*", "X-KMail-*", "X-MailCleaner-*", "X-MailFoundry", "X-MDMailLookup-Result",
"X-ME-*", "X-MessageFilter", "X-Microsoft-Antispam", "X-Mlf-Version", "X-MXScan-*", "X-NAI-Spam-*", "X-NetStation-Status",
"X-OVH-SPAM*", "X-PerlMx-*", "X-PFSI-Info", "X-PMX-*", "X-Policy-Service", "X-policyd-weight", "X-PreRBLs",
"X-Probable-Spam", "X-PROLinux-SpamCheck", "X-Proofpoint-*", "x-purgate-*", "X-Qmail-Scanner-*", "X-Quarantine-ID",
"X-RSpam-Report", "X-SA-*", "X-Scanned-by", "X-SmarterMail-CustomSpamHeader", "X-Spam*", "X-SPF-Scan-By", "X-STA-*",
"X-StarScan-Version", "X-SurGATE-Result", "X-SWITCHham-Score", "X-UI-*", "X-Univie*", "X-Virus*", "X-VR-*",
"X-WatchGuard*", "X-Whitelist-Domain", "X-WUM-CCI", "X_CMAE_Category" ]
threshold.ham = 0.1
threshold.spam = 12.0
"#;

View file

@ -10,6 +10,7 @@ jmap = { path = "../jmap" }
jmap_proto = { path = "../jmap-proto" }
directory = { path = "../directory" }
store = { path = "../store" }
nlp = { path = "../nlp" }
utils = { path = "../utils" }
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "ludicrous_mode"] }
mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] }

View file

@ -34,8 +34,9 @@ use imap_proto::{
use jmap_proto::types::{collection::Collection, id::Id, keyword::Keyword, property::Property};
use mail_parser::HeaderName;
use nlp::language::Language;
use store::{
fts::{builder::MAX_TOKEN_LENGTH, Language},
fts::builder::MAX_TOKEN_LENGTH,
query::{self, log::Query, sort::Pagination, ResultSet},
roaring::RoaringBitmap,
write::now,

View file

@ -6,6 +6,7 @@ resolver = "2"
[dependencies]
store = { path = "../store" }
nlp = { path = "../nlp" }
jmap_proto = { path = "../jmap-proto" }
smtp = { path = "../smtp" }
utils = { path = "../utils" }

View file

@ -23,10 +23,8 @@
use std::{str::FromStr, time::Duration};
use store::{
fts::Language,
rand::{distributions::Alphanumeric, thread_rng, Rng},
};
use nlp::language::Language;
use store::rand::{distributions::Alphanumeric, thread_rng, Rng};
use super::session::BaseCapabilities;

View file

@ -37,11 +37,9 @@ use mail_parser::{
parsers::{fields::thread::thread_name, preview::preview_text},
Addr, Address, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType,
};
use nlp::language::Language;
use store::{
fts::{
builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
Language,
},
fts::builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
write::{BatchBuilder, IntoOperations, F_BITMAP, F_CLEAR, F_INDEX, F_VALUE},
};

View file

@ -28,8 +28,9 @@ use jmap_proto::{
types::{acl::Acl, collection::Collection, keyword::Keyword, property::Property},
};
use mail_parser::HeaderName;
use nlp::language::Language;
use store::{
fts::{builder::MAX_TOKEN_LENGTH, Language},
fts::builder::MAX_TOKEN_LENGTH,
query::{self},
roaring::RoaringBitmap,
ValueKey,

View file

@ -30,14 +30,12 @@ use jmap_proto::{
types::{acl::Acl, collection::Collection},
};
use mail_parser::{decoders::html::html_to_text, MessageParser, PartType};
use nlp::language::{stemmer::Stemmer, Language};
use store::{
fts::{
builder::MAX_TOKEN_LENGTH,
search_snippet::generate_snippet,
stemmer::Stemmer,
term_index::{self, TermIndex},
tokenizers::Tokenizer,
Language,
},
BlobKind,
};
@ -66,7 +64,8 @@ impl JMAP {
|| (text.starts_with('\'') && text.ends_with('\''))
{
terms.push(
Tokenizer::new(&text, language, MAX_TOKEN_LENGTH)
language
.tokenize_text(&text, MAX_TOKEN_LENGTH)
.map(|token| (token.word.into_owned(), None))
.collect::<Vec<_>>(),
);

View file

@ -40,6 +40,7 @@ use jmap_proto::{
},
types::{collection::Collection, property::Property},
};
use nlp::language::Language;
use services::{
delivery::spawn_delivery_manager,
housekeeper::{self, init_housekeeper, spawn_housekeeper},
@ -47,7 +48,6 @@ use services::{
};
use smtp::core::SMTP;
use store::{
fts::Language,
parking_lot::Mutex,
query::{sort::Pagination, Comparator, Filter, ResultSet, SortedResultSet},
roaring::RoaringBitmap,

View file

@ -27,9 +27,9 @@ use jmap_proto::{
object::{mailbox::QueryArguments, Object},
types::{acl::Acl, collection::Collection, property::Property, value::Value},
};
use nlp::language::Language;
use store::{
ahash::{AHashMap, AHashSet},
fts::Language,
query::{self, sort::Pagination},
roaring::RoaringBitmap,
};

View file

@ -28,10 +28,8 @@ use jmap_proto::{
},
types::{collection::Collection, property::Property},
};
use store::{
fts::Language,
query::{self},
};
use nlp::language::Language;
use store::query::{self};
use crate::JMAP;

19
crates/nlp/Cargo.toml Normal file
View file

@ -0,0 +1,19 @@
[package]
name = "nlp"
version = "0.3.9"
edition = "2021"
resolver = "2"
[dependencies]
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
siphasher = "1.0"
serde = { version = "1.0", features = ["derive"]}
bincode = "1.3.3"
nohash = "0.2.0"
ahash = "0.8.3"
lazy_static = "1.4"
whatlang = "0.16" # Language detection
rust-stemmers = "1.2" # Stemmers
tinysegmenter = "0.1" # Japanese tokenizer
jieba-rs = "0.6" # Chinese stemmer

View file

@ -0,0 +1,77 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use nohash::IsEnabled;
use crate::transformers::osb::{Gram, OsbToken};
use super::TokenHash;
pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
buf: Vec<u8>,
tokens: T,
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
pub fn new(tokens: T) -> Self {
Self {
buf: Vec::with_capacity(64),
tokens,
}
}
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
type Item = OsbToken<TokenHash>;
fn next(&mut self) -> Option<Self::Item> {
self.tokens.next().map(|token| {
let bytes = match token.inner {
Gram::Uni { t1 } => t1.as_bytes(),
Gram::Bi { t1, t2, .. } => {
self.buf.clear();
self.buf.extend_from_slice(t1.as_bytes());
self.buf.push(b' ');
self.buf.extend_from_slice(t2.as_bytes());
&self.buf
}
};
OsbToken {
inner: TokenHash {
h1: xxhash_rust::xxh3::xxh3_64(bytes),
h2: farmhash::hash64(bytes),
},
idx: token.idx,
}
})
}
}
impl std::hash::Hash for TokenHash {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
state.write_u64(self.h1 ^ self.h2);
}
}
impl IsEnabled for TokenHash {}
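
A minimal sketch of how the two stages above compose, written against the APIs added in this commit (the driver `main` and sample text are illustrative only, not part of the change):

use nlp::bayes::bloom::BloomHasher;
use nlp::transformers::osb::OsbTokenizer;

fn main() {
    let text = "buy this great product special offer";
    // Window size 5 mirrors the value used in the crate's own tests.
    let grams = OsbTokenizer::new(text.split_ascii_whitespace(), 5);
    // Each Gram (unigram or "t1 t2" bigram) is hashed twice, with xxh3 and
    // farmhash, into the TokenHash consumed by the Bayes model.
    for token in BloomHasher::new(grams) {
        println!("idx {} -> {:?}", token.idx, token.inner);
    }
}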

View file

@ -0,0 +1,167 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use crate::transformers::osb::OsbToken;
use super::{BayesClassifier, Weights};
// Position 0 represents Unigram weights
const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
impl BayesClassifier {
pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
where
T: Iterator<Item = OsbToken<Weights>>,
{
if self.min_learns > 0 && (spam_learns < self.min_learns || ham_learns < self.min_learns) {
return None;
}
let mut processed_tokens = 0;
let mut total_spam_prob = 0.0;
let mut total_ham_prob = 0.0;
for token in tokens {
let weights = token.inner;
let total_count = weights.spam + weights.ham;
if total_count >= self.min_token_hits {
let total_count = total_count as f64;
let spam_freq = weights.spam as f64 / f64::max(1.0, spam_learns as f64);
let ham_freq = weights.ham as f64 / f64::max(1.0, ham_learns as f64);
let spam_prob = spam_freq / (spam_freq + ham_freq);
let ham_prob = ham_freq / (spam_freq + ham_freq);
let fw = FEATURE_WEIGHT[token.idx];
let w = (fw * total_count) / (1.0 + fw * total_count);
let bayes_spam_prob = prob_combine(spam_prob, total_count, w, 0.5);
if !((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + self.min_prob_strength)
|| (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - self.min_prob_strength))
{
let bayes_ham_prob = prob_combine(ham_prob, total_count, w, 0.5);
total_spam_prob += bayes_spam_prob.ln();
total_ham_prob += bayes_ham_prob.ln();
processed_tokens += 1;
}
}
}
if processed_tokens == 0
|| self.min_tokens > 0 && processed_tokens < (self.min_tokens as f64 * 0.1) as u32
{
return None;
}
let (h, s) = if total_spam_prob > -300.0 && total_ham_prob > -300.0 {
/* Fisher value is low enough to apply inv_chi_square */
(
1.0 - inv_chi_square(total_spam_prob, processed_tokens),
1.0 - inv_chi_square(total_ham_prob, processed_tokens),
)
} else {
/* Use naive method */
if total_spam_prob < total_ham_prob {
let h = (1.0 - (total_spam_prob - total_ham_prob).exp())
/ (1.0 + (total_spam_prob - total_ham_prob).exp());
(h, 1.0 - h)
} else {
let s = (1.0 - (total_ham_prob - total_spam_prob).exp())
/ (1.0 + (total_ham_prob - total_spam_prob).exp());
(1.0 - s, s)
}
};
let final_prob = if h.is_finite() && s.is_finite() {
(s + 1.0 - h) / 2.0
} else {
/*
* We have some overflow, hence we need to check which class
* is NaN
*/
if h.is_finite() {
1.0
} else if s.is_finite() {
0.0
} else {
0.5
}
};
if processed_tokens > 0 && (final_prob - 0.5).abs() > 0.05 {
Some(final_prob)
} else {
None
}
}
}
/**
* Returns the probability of chi-square > value with the specified number of
* degrees of freedom
*/
#[inline(always)]
fn inv_chi_square(value: f64, freedom_deg: u32) -> f64 {
let mut prob = value.exp();
if prob.is_finite() {
/*
* m is our confidence in class
* prob is e ^ x (small value since x is normally less than zero
* So we integrate over degrees of freedom and produce the total result
* from 1.0 (no confidence) to 0.0 (full confidence)
*/
let mut sum = prob;
let m = -value;
for i in 1..freedom_deg {
prob *= m / i as f64;
sum += prob;
}
f64::min(1.0, sum)
} else {
/*
* e^x where x is large *NEGATIVE* number is OK, so we have a very strong
* confidence that inv-chi-square is close to zero
*/
if value < 0.0 {
0.0
} else {
1.0
}
}
}
/*#[inline(always)]
fn normalize_probability(x: f64, bias: f64) -> f64 {
((x - bias) * 2.0).powi(8)
}*/
#[inline(always)]
fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))
}
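
As a sanity check on the weighting above, a small sketch (illustrative only, not part of the commit) that reproduces the per-token arithmetic for a unigram seen 3 times in spam and once in ham, with 300 learns of each class:

fn main() {
    // Local copy of prob_combine() above, for illustration.
    fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
        (weight * assumed + cnt * prob) / (weight + cnt)
    }

    let (spam_hits, ham_hits) = (3.0_f64, 1.0_f64);
    let (spam_learns, ham_learns) = (300.0_f64, 300.0_f64);

    let spam_freq = spam_hits / spam_learns;             // 0.01
    let ham_freq = ham_hits / ham_learns;                // ~0.00333
    let spam_prob = spam_freq / (spam_freq + ham_freq);  // 0.75

    // Unigram => FEATURE_WEIGHT[0] = 1.0; the token was seen 4 times in total.
    let fw = 1.0;
    let total_count = spam_hits + ham_hits;
    let w = (fw * total_count) / (1.0 + fw * total_count); // 0.8

    // (0.8 * 0.5 + 4.0 * 0.75) / (0.8 + 4.0) ~= 0.708, outside the
    // 0.5 +/- min_prob_strength dead zone, so ln(0.708) is accumulated.
    println!("{:.3}", prob_combine(spam_prob, total_count, w, 0.5));
}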

View file

@ -0,0 +1,75 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{collections::HashMap, hash::BuildHasherDefault};
use nohash::NoHashHasher;
use serde::{Deserialize, Serialize};
pub mod bloom;
pub mod classify;
pub mod train;
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct BayesModel {
pub weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>>,
pub spam_learns: u32,
pub ham_learns: u32,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct BayesClassifier {
pub min_token_hits: u32,
pub min_tokens: u32,
pub min_prob_strength: f64,
pub min_learns: u32,
}
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
pub struct TokenHash {
h1: u64,
h2: u64,
}
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
pub struct Weights {
spam: u32,
ham: u32,
}
impl BayesClassifier {
pub fn new() -> Self {
BayesClassifier {
min_token_hits: 2,
min_tokens: 11,
min_prob_strength: 0.05,
min_learns: 200,
}
}
}
impl Default for BayesClassifier {
fn default() -> Self {
Self::new()
}
}

View file

@ -0,0 +1,68 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use crate::transformers::osb::OsbToken;
use super::{BayesModel, TokenHash};
impl BayesModel {
pub fn train<T>(&mut self, tokens: T, is_spam: bool)
where
T: IntoIterator<Item = OsbToken<TokenHash>>,
{
if is_spam {
self.spam_learns += 1;
} else {
self.ham_learns += 1;
}
for token in tokens {
let hs = self.weights.entry(token.inner).or_default();
if is_spam {
hs.spam += 1;
} else {
hs.ham += 1;
}
}
}
pub fn untrain<T>(&mut self, tokens: T, is_spam: bool)
where
T: IntoIterator<Item = OsbToken<TokenHash>>,
{
if is_spam {
self.spam_learns -= 1;
} else {
self.ham_learns -= 1;
}
for token in tokens {
let hs = self.weights.entry(token.inner).or_default();
if is_spam {
hs.spam -= 1;
} else {
hs.ham -= 1;
}
}
}
}
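
A sketch of a train/untrain round trip using the hashing pipeline from `bayes::bloom` (the driver code and sample message are illustrative only):

use nlp::bayes::{bloom::BloomHasher, BayesModel};
use nlp::transformers::osb::OsbTokenizer;

fn main() {
    let mut model = BayesModel::default();
    let msg = "buy this great product special offer";

    // Learn the message as spam: bumps spam_learns and the spam count of
    // every hashed OSB token...
    model.train(
        BloomHasher::new(OsbTokenizer::new(msg.split_ascii_whitespace(), 5)),
        true,
    );
    assert_eq!(model.spam_learns, 1);

    // ...and untrain it again, which walks the same tokens and decrements
    // the counters back.
    model.untrain(
        BloomHasher::new(OsbTokenizer::new(msg.split_ascii_whitespace(), 5)),
        true,
    );
    assert_eq!(model.spam_learns, 0);
}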

View file

@ -0,0 +1,202 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::borrow::Cow;
use crate::tokenizers::{
chinese::ChineseTokenizer, japanese::JapaneseTokenizer, word::WordTokenizer, Token,
};
use self::detect::LanguageDetector;
pub mod detect;
pub mod stemmer;
pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
impl Language {
pub fn tokenize_text<'x>(
&self,
text: &'x str,
max_token_length: usize,
) -> LanguageTokenizer<'x> {
match self {
Language::Japanese => Box::new(
JapaneseTokenizer::new(WordTokenizer::new(text, usize::MAX))
.filter(move |t| t.word.len() <= max_token_length),
),
Language::Mandarin => Box::new(
ChineseTokenizer::new(WordTokenizer::new(text, usize::MAX))
.filter(move |t| t.word.len() <= max_token_length),
),
_ => Box::new(WordTokenizer::new(text, max_token_length)),
}
}
}
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
pub enum Language {
Esperanto = 0,
English = 1,
Russian = 2,
Mandarin = 3,
Spanish = 4,
Portuguese = 5,
Italian = 6,
Bengali = 7,
French = 8,
German = 9,
Ukrainian = 10,
Georgian = 11,
Arabic = 12,
Hindi = 13,
Japanese = 14,
Hebrew = 15,
Yiddish = 16,
Polish = 17,
Amharic = 18,
Javanese = 19,
Korean = 20,
Bokmal = 21,
Danish = 22,
Swedish = 23,
Finnish = 24,
Turkish = 25,
Dutch = 26,
Hungarian = 27,
Czech = 28,
Greek = 29,
Bulgarian = 30,
Belarusian = 31,
Marathi = 32,
Kannada = 33,
Romanian = 34,
Slovene = 35,
Croatian = 36,
Serbian = 37,
Macedonian = 38,
Lithuanian = 39,
Latvian = 40,
Estonian = 41,
Tamil = 42,
Vietnamese = 43,
Urdu = 44,
Thai = 45,
Gujarati = 46,
Uzbek = 47,
Punjabi = 48,
Azerbaijani = 49,
Indonesian = 50,
Telugu = 51,
Persian = 52,
Malayalam = 53,
Oriya = 54,
Burmese = 55,
Nepali = 56,
Sinhalese = 57,
Khmer = 58,
Turkmen = 59,
Akan = 60,
Zulu = 61,
Shona = 62,
Afrikaans = 63,
Latin = 64,
Slovak = 65,
Catalan = 66,
Tagalog = 67,
Armenian = 68,
Unknown = 69,
None = 70,
}
impl Language {
pub fn from_iso_639(code: &str) -> Option<Self> {
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
_ => return None,
}
.into()
}
}
impl Language {
pub fn detect(text: String, default: Language) -> (String, Language) {
if let Some((l, t)) = text
.split_once(':')
.and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
{
(t.to_string(), l)
} else {
let l = LanguageDetector::detect_single(&text)
.and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
.unwrap_or(default);
(text, l)
}
}
}
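
A sketch of the two code paths in `detect`: an explicit ISO-639 prefix is trusted and stripped, otherwise the detector (whatlang, per the crate's dependencies) must score above 0.3 or the default is kept. The Polish sample and its expected outcome are assumptions, not asserted:

use nlp::language::Language;

fn main() {
    // An explicit ISO-639 prefix wins and is stripped from the text.
    let (text, lang) = Language::detect("en:hello world".to_string(), Language::None);
    assert_eq!((text.as_str(), lang), ("hello world", Language::English));

    // Without a prefix the detector scores the text; the default is kept
    // when confidence is 0.3 or lower.
    let (_, lang) = Language::detect(
        "Szybki brązowy lis przeskakuje nad leniwym psem".to_string(),
        Language::None,
    );
    println!("{lang:?}"); // expected to report Polish, but not asserted here
}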

View file

@ -25,25 +25,25 @@ use std::borrow::Cow;
use rust_stemmers::Algorithm;
use super::{tokenizers::Tokenizer, Language};
use super::{Language, LanguageTokenizer};
#[derive(Debug, PartialEq, Eq)]
pub struct StemmedToken<'x> {
pub word: Cow<'x, str>,
pub stemmed_word: Option<Cow<'x, str>>,
pub offset: u32, // Word offset in the text part
pub len: u8, // Word length
pub from: usize, // Word offset in the text part
pub to: usize, // Word length
}
pub struct Stemmer<'x> {
stemmer: Option<rust_stemmers::Stemmer>,
tokenizer: Tokenizer<'x>,
tokenizer: LanguageTokenizer<'x>,
}
impl<'x> Stemmer<'x> {
pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
Stemmer {
tokenizer: Tokenizer::new(text, language, max_token_length),
tokenizer: language.tokenize_text(text, max_token_length),
stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
}
}
@ -57,15 +57,15 @@ impl<'x> Iterator for Stemmer<'x> {
Some(StemmedToken {
stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
match stemmer.stem(&token.word) {
Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
Cow::Owned(text) if text.len() != token.word.len() || text != token.word => {
Some(text.into())
}
_ => None,
}
}),
word: token.word,
offset: token.offset,
len: token.len,
from: token.from,
to: token.to,
})
}
}
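
A usage sketch for the updated `Stemmer` (illustrative only); `stemmed_word` is `None` whenever the Snowball stem equals the original word:

use nlp::language::{stemmer::Stemmer, Language};

fn main() {
    for token in Stemmer::new("Running dogs were barking loudly", Language::English, 40) {
        // e.g. "running" with stemmed_word Some("run"), spanning bytes 0..7
        println!("{} -> {:?} ({}..{})", token.word, token.stemmed_word, token.from, token.to);
    }
}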

78
crates/nlp/src/lib.rs Normal file
View file

@ -0,0 +1,78 @@
use ahash::AHashSet;
pub mod bayes;
pub mod language;
pub mod tokenizers;
pub mod transformers;
#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
pub suffixes: AHashSet<String>,
pub exceptions: AHashSet<String>,
pub wildcards: Vec<String>,
}
impl PublicSuffix {
pub fn contains(&self, suffix: &str) -> bool {
self.suffixes.contains(suffix)
|| (!self.exceptions.contains(suffix)
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
}
}
#[cfg(test)]
mod test {
use std::fs;
use crate::{
bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
transformers::osb::{OsbToken, OsbTokenizer},
};
#[test]
#[ignore]
fn train() {
let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
let mut bayes = BayesModel::default();
for line in db.lines() {
let (text, is_spam) = line.rsplit_once(',').unwrap();
let is_spam = is_spam == "1";
bayes.train(
BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
is_spam,
);
}
println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
}
#[test]
#[ignore]
fn classify() {
let model: BayesModel =
bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
let bayes = BayesClassifier::new();
for text in [
"i am attaching to this email a presentation to integrate the spreadsheet into our server",
"buy this great product special offer sales",
"i m using simple dns from jhsoft we support only a few web sites and i d like to swap secondary services with someone in a similar position",
"viagra xenical vioxx zyban propecia we only offer the real viagra xenical ",
] {
println!(
"{:?} -> {}",
text,
bayes
.classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
OsbToken {
idx: x.idx,
inner: *w,
}
})), model.ham_learns, model.spam_learns)
.unwrap()
);
}
}
}
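
The `PublicSuffix` helper declared at the top of this file is plain set logic; a sketch of the intended behaviour once populated (the commit does not show where the suffix lists are loaded, so the entries below are hypothetical):

use nlp::PublicSuffix;

fn main() {
    let mut psl = PublicSuffix::default();
    psl.suffixes.insert("co.uk".to_string());
    psl.wildcards.push("ck".to_string());        // stands in for a "*.ck" rule
    psl.exceptions.insert("www.ck".to_string()); // stands in for a "!www.ck" rule

    assert!(psl.contains("co.uk"));       // exact suffix match
    assert!(psl.contains("anything.ck")); // ends with a wildcard entry
    assert!(!psl.contains("www.ck"));     // exception short-circuits the wildcard
    assert!(!psl.contains("com.au"));     // not in any list
}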

View file

@ -0,0 +1,222 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, vec::IntoIter};
use jieba_rs::Jieba;
use super::{InnerToken, Token};
use lazy_static::lazy_static;
lazy_static! {
static ref JIEBA: Jieba = Jieba::new();
}
pub struct ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
tokenizer: T,
tokens: IntoIter<Token<I>>,
phantom: std::marker::PhantomData<&'x str>,
}
impl<'x, T, I> ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
pub fn new(tokenizer: T) -> Self {
ChineseTokenizer {
tokenizer,
tokens: Vec::new().into_iter(),
phantom: std::marker::PhantomData,
}
}
}
impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
type Item = Token<I>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.tokens.next() {
return Some(token);
} else {
let token = self.tokenizer.next()?;
if token.word.is_alphabetic_8bit() {
let mut token_to = token.from;
match token.word.unwrap_alphabetic() {
Cow::Borrowed(word) => {
self.tokens = JIEBA
.cut(word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
Cow::Owned(word) => {
self.tokens = JIEBA
.cut(&word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word.to_string()),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
}
} else {
return token.into();
}
}
}
}
}
#[cfg(test)]
mod tests {
use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
#[test]
fn chinese_tokenizer() {
assert_eq!(
ChineseTokenizer::new(WordTokenizer::new(
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
40
),)
.collect::<Vec<_>>(),
vec![
Token {
word: "".into(),
from: 0,
to: 3
},
Token {
word: "".into(),
from: 3,
to: 6
},
Token {
word: "".into(),
from: 6,
to: 9
},
Token {
word: "".into(),
from: 12,
to: 15
},
Token {
word: "".into(),
from: 15,
to: 18
},
Token {
word: "".into(),
from: 21,
to: 24
},
Token {
word: "".into(),
from: 24,
to: 27
},
Token {
word: "大事".into(),
from: 27,
to: 33
},
Token {
word: "".into(),
from: 36,
to: 39
},
Token {
word: "".into(),
from: 39,
to: 42
},
Token {
word: "".into(),
from: 42,
to: 45
},
Token {
word: "".into(),
from: 45,
to: 48
},
Token {
word: "存亡".into(),
from: 51,
to: 57
},
Token {
word: "".into(),
from: 57,
to: 60
},
Token {
word: "".into(),
from: 60,
to: 63
},
Token {
word: "不可不".into(),
from: 66,
to: 75
},
Token {
word: "".into(),
from: 75,
to: 78
},
Token {
word: "".into(),
from: 78,
to: 81
}
]
);
}
}

View file

@ -0,0 +1,179 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::vec::IntoIter;
use super::{InnerToken, Token};
pub struct JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
tokenizer: T,
tokens: IntoIter<Token<I>>,
phantom: std::marker::PhantomData<&'x str>,
}
impl<'x, T, I> JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
pub fn new(tokenizer: T) -> Self {
JapaneseTokenizer {
tokenizer,
tokens: Vec::new().into_iter(),
phantom: std::marker::PhantomData,
}
}
}
impl<'x, T, I> Iterator for JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
type Item = Token<I>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.tokens.next() {
return Some(token);
} else {
let token = self.tokenizer.next()?;
if token.word.is_alphabetic_8bit() {
let mut token_to = token.from;
self.tokens = tinysegmenter::tokenize(token.word.unwrap_alphabetic().as_ref())
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word.to_string()),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
} else {
return token.into();
}
}
}
}
}
#[cfg(test)]
mod tests {
use crate::tokenizers::{japanese::JapaneseTokenizer, word::WordTokenizer, Token};
#[test]
fn japanese_tokenizer() {
assert_eq!(
JapaneseTokenizer::new(WordTokenizer::new(
"お先に失礼します あなたの名前は何ですか 123 abc-872",
40
))
.collect::<Vec<_>>(),
vec![
Token {
word: "お先".into(),
from: 0,
to: 6
},
Token {
word: "".into(),
from: 6,
to: 9
},
Token {
word: "失礼".into(),
from: 9,
to: 15
},
Token {
word: "".into(),
from: 15,
to: 18
},
Token {
word: "ます".into(),
from: 18,
to: 24
},
Token {
word: "あなた".into(),
from: 25,
to: 34
},
Token {
word: "".into(),
from: 34,
to: 37
},
Token {
word: "名前".into(),
from: 37,
to: 43
},
Token {
word: "".into(),
from: 43,
to: 46
},
Token {
word: "".into(),
from: 46,
to: 49
},
Token {
word: "です".into(),
from: 49,
to: 55
},
Token {
word: "".into(),
from: 55,
to: 58
},
Token {
word: "123".into(),
from: 59,
to: 62
},
Token {
word: "abc".into(),
from: 63,
to: 66
},
Token {
word: "872".into(),
from: 67,
to: 70
}
]
);
}
}

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
pub mod chinese;
pub mod japanese;
pub mod space;
pub mod types;
pub mod word;
use std::borrow::Cow;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<T> {
pub word: T,
pub from: usize,
pub to: usize,
}
pub trait InnerToken<'x>: Sized {
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self;
fn unwrap_alphabetic(self) -> Cow<'x, str>;
fn is_alphabetic(&self) -> bool;
fn is_alphabetic_8bit(&self) -> bool;
}
impl<'x> InnerToken<'x> for Cow<'x, str> {
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self {
value.into()
}
fn is_alphabetic(&self) -> bool {
true
}
fn is_alphabetic_8bit(&self) -> bool {
!self.chars().all(|c| c.is_ascii())
}
fn unwrap_alphabetic(self) -> Cow<'x, str> {
self
}
}
impl<T> Token<T> {
pub fn new(offset: usize, len: usize, word: T) -> Token<T> {
debug_assert!(offset <= u32::max_value() as usize);
debug_assert!(len <= u8::max_value() as usize);
Token {
from: offset,
to: offset + len,
word,
}
}
}

File diff suppressed because it is too large.

View file

@ -21,19 +21,19 @@
* for more details.
*/
use std::str::CharIndices;
use std::{borrow::Cow, str::CharIndices};
use super::Token;
pub struct IndoEuropeanTokenizer<'x> {
pub struct WordTokenizer<'x> {
max_token_length: usize,
text: &'x str,
iterator: CharIndices<'x>,
}
impl<'x> IndoEuropeanTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
IndoEuropeanTokenizer {
impl<'x> WordTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> WordTokenizer {
WordTokenizer {
max_token_length,
text,
iterator: text.char_indices(),
@ -42,8 +42,8 @@ impl<'x> IndoEuropeanTokenizer<'x> {
}
/// Parses indo-european text into lowercase tokens.
impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
type Item = Token<'x>;
impl<'x> Iterator for WordTokenizer<'x> {
type Item = Token<Cow<'x, str>>;
fn next(&mut self) -> Option<Self::Item> {
while let Some((token_start, ch)) = self.iterator.next() {
@ -159,7 +159,7 @@ mod tests {
];
for (input, tokens) in inputs.iter() {
for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
for (pos, token) in WordTokenizer::new(input, 40).enumerate() {
assert_eq!(token, tokens[pos]);
}
}
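
A usage sketch for the renamed `WordTokenizer` (illustrative only); per the doc comment above it emits lowercase words, and the new `from`/`to` fields carry byte offsets into the input:

use nlp::tokenizers::word::WordTokenizer;

fn main() {
    // 40 mirrors the max token length used throughout the crate's tests;
    // each token records where the word starts and ends in the source text.
    for token in WordTokenizer::new("Hello, NLP world!", 40) {
        println!("{:?} ({}..{})", token.word, token.from, token.to);
    }
}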

View file

@ -21,41 +21,4 @@
* for more details.
*/
use std::borrow::Cow;
use super::bloom::{BloomFilter, BloomHashGroup};
pub trait ToNgrams: Sized {
fn new(items: usize) -> Self;
fn insert(&mut self, item: &str);
fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
let mut filter = Self::new(tokens.len().saturating_sub(1));
for words in tokens.windows(n) {
filter.insert(&words.join(" "));
}
filter
}
}
impl ToNgrams for BloomFilter {
fn new(items: usize) -> Self {
BloomFilter::new(items)
}
fn insert(&mut self, item: &str) {
self.insert(&item.into())
}
}
impl ToNgrams for Vec<BloomHashGroup> {
fn new(items: usize) -> Self {
Vec::with_capacity(items)
}
fn insert(&mut self, item: &str) {
self.push(BloomHashGroup {
h1: item.into(),
h2: None,
})
}
}
pub mod osb;

View file

@ -0,0 +1,467 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::iter::Peekable;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OsbToken<T> {
pub inner: T,
pub idx: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Gram<'x> {
Uni { t1: &'x str },
Bi { t1: &'x str, t2: &'x str },
}
pub struct OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
iter: Peekable<I>,
buf: Vec<Option<&'x str>>,
window_size: usize,
window_pos: usize,
window_idx: usize,
}
impl<'x, I> OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
pub fn new(iter: I, window_size: usize) -> Self {
Self {
iter: iter.peekable(),
buf: vec![None; window_size],
window_pos: 0,
window_idx: 0,
window_size,
}
}
}
impl<'x, I> Iterator for OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
type Item = OsbToken<Gram<'x>>;
fn next(&mut self) -> Option<Self::Item> {
let end_pos = (self.window_pos + self.window_idx) % self.window_size;
if self.buf[end_pos].is_none() {
self.buf[end_pos] = self.iter.next();
}
let t1 = self.buf[self.window_pos % self.window_size]?;
let token = OsbToken {
inner: if self.window_idx != 0 {
Gram::Bi {
t1,
t2: self.buf[end_pos]?,
}
} else {
Gram::Uni { t1 }
},
idx: self.window_idx,
};
// Increment window
self.window_idx += 1;
if self.window_idx == self.window_size
|| (self.iter.peek().is_none()
&& self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
{
self.buf[self.window_pos % self.window_size] = None;
self.window_idx = 0;
self.window_pos += 1;
}
Some(token)
}
}
#[cfg(test)]
mod test {
use crate::transformers::osb::{Gram, OsbToken};
#[test]
fn osb_tokenizer() {
assert_eq!(
super::OsbTokenizer::new(
"The quick brown fox jumps over the lazy dog and the lazy cat"
.split_ascii_whitespace(),
5
)
.collect::<Vec<_>>(),
vec![
OsbToken {
inner: Gram::Uni { t1: "The" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "quick"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "brown"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "fox"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "jumps"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "quick" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "brown"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "fox"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "jumps"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "over"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "brown" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "fox"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "jumps"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "over"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "fox" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "jumps"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "over"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "jumps" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "over"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "dog"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "over" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "dog"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "and"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "dog"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "and"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "dog"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "and"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "dog" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "and"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "cat"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "and" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "cat"
},
idx: 3
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "cat"
},
idx: 2
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "cat"
},
idx: 1
},
OsbToken {
inner: Gram::Uni { t1: "cat" },
idx: 0
}
]
);
}
}

View file

@ -6,6 +6,7 @@ resolver = "2"
[dependencies]
utils = { path = "../utils" }
nlp = { path = "../nlp" }
maybe-async = { path = "../maybe-async" }
rocksdb = { version = "0.20.1", optional = true }
foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true }
@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]}
ahash = { version = "0.8.0", features = ["serde"] }
bitpacking = "0.8.4"
lazy_static = "1.4"
whatlang = "0.16" # Language detection
rust-stemmers = "1.2" # Stemmers
tinysegmenter = "0.1" # Japanese tokenizer
jieba-rs = "0.6" # Chinese stemmer
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
siphasher = "0.3"
siphasher = "1.0"
parking_lot = "0.12.1"
lru-cache = { version = "0.1.2", optional = true }
num_cpus = { version = "1.15.0", optional = true }

View file

@ -27,13 +27,12 @@ use std::{
hash::{Hash, Hasher},
};
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use roaring::RoaringBitmap;
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
use crate::{Deserialize, Error, Serialize};
use super::{stemmer::StemmedToken, tokenizers::Token};
pub struct BloomFilter {
m: u64,
b: RoaringBitmap,
@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
}
}
impl From<Token<'_>> for BloomHashGroup {
fn from(t: Token<'_>) -> Self {
impl From<Token<Cow<'_, str>>> for BloomHashGroup {
fn from(t: Token<Cow<'_, str>>) -> Self {
Self {
h1: BloomHash::hash(t.word.as_ref()),
h2: None,

View file

@ -24,6 +24,14 @@
use std::{borrow::Cow, collections::HashSet};
use ahash::AHashSet;
use nlp::{
language::{
detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
stemmer::Stemmer,
Language,
},
tokenizers::{space::SpaceTokenizer, Token},
};
use utils::map::vec_map::VecMap;
use crate::{
@ -32,13 +40,7 @@ use crate::{
Serialize, HASH_EXACT, HASH_STEMMED,
};
use super::{
lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
stemmer::Stemmer,
term_index::{TermIndexBuilder, TokenIndex},
tokenizers::{space::SpaceTokenizer, Token},
Language,
};
use super::term_index::{TermIndexBuilder, TokenIndex};
pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
terms.push(term_index.add_token(Token {
word: token.into(),
offset: 0,
len: 0,
from: 0,
to: 0,
}));
}
term_index.add_terms(field, 0, terms);

View file

@ -26,149 +26,13 @@ use crate::{
BitmapKey, Serialize, BM_HASH,
};
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
pub mod lang;
//pub mod pdf;
pub mod bloom;
pub mod builder;
pub mod ngram;
pub mod query;
pub mod search_snippet;
pub mod stemmer;
pub mod term_index;
pub mod tokenizers;
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
pub enum Language {
Esperanto = 0,
English = 1,
Russian = 2,
Mandarin = 3,
Spanish = 4,
Portuguese = 5,
Italian = 6,
Bengali = 7,
French = 8,
German = 9,
Ukrainian = 10,
Georgian = 11,
Arabic = 12,
Hindi = 13,
Japanese = 14,
Hebrew = 15,
Yiddish = 16,
Polish = 17,
Amharic = 18,
Javanese = 19,
Korean = 20,
Bokmal = 21,
Danish = 22,
Swedish = 23,
Finnish = 24,
Turkish = 25,
Dutch = 26,
Hungarian = 27,
Czech = 28,
Greek = 29,
Bulgarian = 30,
Belarusian = 31,
Marathi = 32,
Kannada = 33,
Romanian = 34,
Slovene = 35,
Croatian = 36,
Serbian = 37,
Macedonian = 38,
Lithuanian = 39,
Latvian = 40,
Estonian = 41,
Tamil = 42,
Vietnamese = 43,
Urdu = 44,
Thai = 45,
Gujarati = 46,
Uzbek = 47,
Punjabi = 48,
Azerbaijani = 49,
Indonesian = 50,
Telugu = 51,
Persian = 52,
Malayalam = 53,
Oriya = 54,
Burmese = 55,
Nepali = 56,
Sinhalese = 57,
Khmer = 58,
Turkmen = 59,
Akan = 60,
Zulu = 61,
Shona = 62,
Afrikaans = 63,
Latin = 64,
Slovak = 65,
Catalan = 66,
Tagalog = 67,
Armenian = 68,
Unknown = 69,
None = 70,
}
impl Language {
pub fn from_iso_639(code: &str) -> Option<Self> {
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
_ => return None,
}
.into()
}
}
impl BitmapKey<Vec<u8>> {
pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
@@ -209,19 +73,3 @@ impl Operation {
}
}
}
impl Language {
pub fn detect(text: String, default: Language) -> (String, Language) {
if let Some((l, t)) = text
.split_once(':')
.and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
{
(t.to_string(), l)
} else {
let l = LanguageDetector::detect_single(&text)
.and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
.unwrap_or(default);
(text, l)
}
}
}
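For reference, the helpers removed above behave as in this sketch; the example inputs are illustrative and not taken from the commit:

```rust
// ISO 639 lookup ignores any region subtag and returns None for unknown codes.
assert_eq!(Language::from_iso_639("pt-BR"), Some(Language::Portuguese));
assert_eq!(Language::from_iso_639("xx"), None);

// detect() honors an explicit "<code>:" prefix; otherwise it runs automatic
// detection, keeps the result only above the 0.3 confidence threshold, and
// falls back to the supplied default.
let (text, lang) = Language::detect("en:Hello world".to_string(), Language::Unknown);
assert_eq!((text.as_str(), lang), ("Hello world", Language::English));
```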

View file

@@ -21,14 +21,14 @@
* for more details.
*/
use nlp::language::{stemmer::Stemmer, Language};
use roaring::RoaringBitmap;
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
};
use super::{term_index::TermIndex, Language};
use super::term_index::TermIndex;
impl ReadTransaction<'_> {
#[maybe_async::maybe_async]
@@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
if match_phrase {
let mut phrase = Vec::new();
let mut bit_keys = Vec::new();
for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
let key = BitmapKey::hash(
token.word.as_ref(),
account_id,

View file

@@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
#[cfg(test)]
mod tests {
use nlp::language::Language;
use crate::{
fts::{
term_index::{TermIndex, TermIndexBuilder},
tokenizers::Tokenizer,
Language,
},
fts::term_index::{TermIndex, TermIndexBuilder},
Deserialize, Serialize,
};
@@ -242,7 +240,7 @@ mod tests {
for (field_num, part) in parts.iter().enumerate() {
let mut terms = Vec::new();
for token in Tokenizer::new(part, Language::English, 40) {
for token in Language::English.tokenize_text(part, 40) {
terms.push(builder.add_token(token));
}
builder.add_terms(field_num as u8, 0, terms);
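Call sites now tokenize through the `Language` value itself instead of constructing a `Tokenizer`. A minimal usage sketch, assuming `tokenize_text` yields tokens whose `word` is a `Cow<str>` as in the hunks above:

```rust
use nlp::language::Language;

// Collect words no longer than 40 bytes from a text part, mirroring the updated call sites.
fn words(text: &str) -> Vec<String> {
    Language::English
        .tokenize_text(text, 40) // replaces Tokenizer::new(text, Language::English, 40)
        .map(|token| token.word.into_owned())
        .collect()
}
```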

View file

@@ -21,14 +21,13 @@
* for more details.
*/
use std::convert::TryInto;
use std::{borrow::Cow, convert::TryInto};
use crate::{Deserialize, Serialize};
use super::{stemmer::StemmedToken, tokenizers::Token};
use ahash::{AHashMap, AHashSet};
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
#[derive(Debug)]
@@ -227,7 +226,7 @@ impl TermIndexBuilder {
}
}
pub fn add_token(&mut self, token: Token) -> Term {
pub fn add_token(&mut self, token: Token<Cow<str>>) -> Term {
let id = self.terms.len() as u32;
let id = self
.terms
@@ -236,8 +235,8 @@ impl TermIndexBuilder {
Term {
id: *id,
id_stemmed: *id,
offset: token.offset,
len: token.len,
offset: token.from as u32,
len: (token.to - token.from) as u8,
}
}
@@ -259,8 +258,8 @@ impl TermIndexBuilder {
Term {
id,
id_stemmed,
offset: token.offset,
len: token.len,
offset: token.from as u32,
len: (token.to - token.from) as u8,
}
}
@@ -775,13 +774,10 @@ impl TokenIndex {
mod tests {
use ahash::AHashMap;
use nlp::language::{stemmer::Stemmer, Language};
use crate::{
fts::{
stemmer::Stemmer,
term_index::{TermIndexBuilder, TokenIndex},
Language,
},
fts::term_index::{TermIndexBuilder, TokenIndex},
Deserialize, Serialize,
};

View file

@@ -1,197 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, vec::IntoIter};
use jieba_rs::Jieba;
use super::{word::WordTokenizer, Token};
use lazy_static::lazy_static;
lazy_static! {
static ref JIEBA: Jieba = Jieba::new();
}
pub struct ChineseTokenizer<'x> {
word_tokenizer: WordTokenizer<'x>,
tokens: IntoIter<&'x str>,
token_offset: usize,
token_len: usize,
token_len_cur: usize,
max_token_length: usize,
}
impl<'x> ChineseTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer {
ChineseTokenizer {
word_tokenizer: WordTokenizer::new(text),
tokens: Vec::new().into_iter(),
max_token_length,
token_offset: 0,
token_len: 0,
token_len_cur: 0,
}
}
}
impl<'x> Iterator for ChineseTokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(ch_token) = self.tokens.next() {
let offset_start = self.token_offset + self.token_len_cur;
self.token_len_cur += ch_token.len();
if ch_token.len() <= self.max_token_length {
return Token::new(offset_start, ch_token.len(), ch_token.into()).into();
}
} else {
loop {
let (token, is_ascii) = self.word_tokenizer.next()?;
if !is_ascii {
let word = match token.word {
Cow::Borrowed(word) => word,
Cow::Owned(_) => unreachable!(),
};
self.tokens = JIEBA.cut(word, false).into_iter();
self.token_offset = token.offset as usize;
self.token_len = token.len as usize;
self.token_len_cur = 0;
break;
} else if token.len as usize <= self.max_token_length {
return token.into();
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn chinese_tokenizer() {
assert_eq!(
ChineseTokenizer::new(
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
40
)
.collect::<Vec<_>>(),
vec![
Token {
word: "".into(),
offset: 0,
len: 3
},
Token {
word: "".into(),
offset: 3,
len: 3
},
Token {
word: "".into(),
offset: 6,
len: 3
},
Token {
word: "".into(),
offset: 12,
len: 3
},
Token {
word: "".into(),
offset: 15,
len: 3
},
Token {
word: "".into(),
offset: 21,
len: 3
},
Token {
word: "".into(),
offset: 24,
len: 3
},
Token {
word: "大事".into(),
offset: 27,
len: 6
},
Token {
word: "".into(),
offset: 36,
len: 3
},
Token {
word: "".into(),
offset: 39,
len: 3
},
Token {
word: "".into(),
offset: 42,
len: 3
},
Token {
word: "".into(),
offset: 45,
len: 3
},
Token {
word: "存亡".into(),
offset: 51,
len: 6
},
Token {
word: "".into(),
offset: 57,
len: 3
},
Token {
word: "".into(),
offset: 60,
len: 3
},
Token {
word: "不可不".into(),
offset: 66,
len: 9
},
Token {
word: "".into(),
offset: 75,
len: 3
},
Token {
word: "".into(),
offset: 78,
len: 3
}
]
);
}
}

View file

@@ -1,168 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::vec::IntoIter;
use super::{word::WordTokenizer, Token};
pub struct JapaneseTokenizer<'x> {
word_tokenizer: WordTokenizer<'x>,
tokens: IntoIter<String>,
token_offset: usize,
token_len: usize,
token_len_cur: usize,
max_token_length: usize,
}
impl<'x> JapaneseTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer {
JapaneseTokenizer {
word_tokenizer: WordTokenizer::new(text),
tokens: Vec::new().into_iter(),
max_token_length,
token_offset: 0,
token_len: 0,
token_len_cur: 0,
}
}
}
impl<'x> Iterator for JapaneseTokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(jp_token) = self.tokens.next() {
let offset_start = self.token_offset + self.token_len_cur;
self.token_len_cur += jp_token.len();
if jp_token.len() <= self.max_token_length {
return Token::new(offset_start, jp_token.len(), jp_token.into()).into();
}
} else {
loop {
let (token, is_ascii) = self.word_tokenizer.next()?;
if !is_ascii {
self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter();
self.token_offset = token.offset as usize;
self.token_len = token.len as usize;
self.token_len_cur = 0;
break;
} else if token.len as usize <= self.max_token_length {
return token.into();
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn japanese_tokenizer() {
assert_eq!(
JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40)
.collect::<Vec<_>>(),
vec![
Token {
word: "お先".into(),
offset: 0,
len: 6
},
Token {
word: "".into(),
offset: 6,
len: 3
},
Token {
word: "失礼".into(),
offset: 9,
len: 6
},
Token {
word: "".into(),
offset: 15,
len: 3
},
Token {
word: "ます".into(),
offset: 18,
len: 6
},
Token {
word: "あなた".into(),
offset: 25,
len: 9
},
Token {
word: "".into(),
offset: 34,
len: 3
},
Token {
word: "名前".into(),
offset: 37,
len: 6
},
Token {
word: "".into(),
offset: 43,
len: 3
},
Token {
word: "".into(),
offset: 46,
len: 3
},
Token {
word: "です".into(),
offset: 49,
len: 6
},
Token {
word: "".into(),
offset: 55,
len: 3
},
Token {
word: "123".into(),
offset: 59,
len: 3
},
Token {
word: "abc".into(),
offset: 63,
len: 3
},
Token {
word: "872".into(),
offset: 67,
len: 3
}
]
);
}
}

View file

@@ -1,96 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
pub mod chinese;
pub mod indo_european;
pub mod japanese;
pub mod space;
pub mod word;
use std::borrow::Cow;
use self::{
chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer,
};
use super::Language;
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'x> {
pub word: Cow<'x, str>,
pub offset: u32, // Word offset in the text part
pub len: u8, // Word length
}
impl<'x> Token<'x> {
pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> {
debug_assert!(offset <= u32::max_value() as usize);
debug_assert!(len <= u8::max_value() as usize);
Token {
offset: offset as u32,
len: len as u8,
word,
}
}
}
enum LanguageTokenizer<'x> {
IndoEuropean(IndoEuropeanTokenizer<'x>),
Japanese(JapaneseTokenizer<'x>),
Chinese(ChineseTokenizer<'x>),
}
pub struct Tokenizer<'x> {
tokenizer: LanguageTokenizer<'x>,
}
impl<'x> Tokenizer<'x> {
pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self {
Tokenizer {
tokenizer: match language {
Language::Japanese => {
LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length))
}
Language::Mandarin => {
LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length))
}
_ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new(
text,
max_token_length,
)),
},
}
}
}
impl<'x> Iterator for Tokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.tokenizer {
LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(),
LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(),
LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(),
}
}
}

View file

@@ -1,80 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::str::CharIndices;
use super::Token;
pub struct WordTokenizer<'x> {
text: &'x str,
iterator: CharIndices<'x>,
}
impl<'x> WordTokenizer<'x> {
pub fn new(text: &str) -> WordTokenizer {
WordTokenizer {
text,
iterator: text.char_indices(),
}
}
}
/// Parses text into tokens, used by non-IndoEuropean tokenizers.
impl<'x> Iterator for WordTokenizer<'x> {
type Item = (Token<'x>, bool);
fn next(&mut self) -> Option<Self::Item> {
let mut is_ascii = true;
while let Some((token_start, ch)) = self.iterator.next() {
if ch.is_alphanumeric() {
let token_end = (&mut self.iterator)
.filter_map(|(pos, ch)| {
if ch.is_alphanumeric() {
if is_ascii && !ch.is_ascii() {
is_ascii = false;
}
None
} else {
pos.into()
}
})
.next()
.unwrap_or(self.text.len());
let token_len = token_end - token_start;
if token_end > token_start {
return (
Token::new(
token_start,
token_len,
self.text[token_start..token_end].into(),
),
is_ascii,
)
.into();
}
}
}
None
}
}

View file

@@ -24,12 +24,10 @@
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
use ahash::HashSet;
use nlp::tokenizers::space::SpaceTokenizer;
use roaring::RoaringBitmap;
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
BitmapKey, ReadTransaction, Store,
};
use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store};
use super::{Filter, ResultSet, TextMatch};

View file

@@ -26,11 +26,10 @@ pub mod get;
pub mod log;
pub mod sort;
use nlp::language::Language;
use roaring::RoaringBitmap;
use crate::{
fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS,
};
use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {

View file

@@ -23,11 +23,11 @@
use std::{collections::HashSet, slice::Iter, time::SystemTime};
use nlp::tokenizers::space::SpaceTokenizer;
use utils::codec::leb128::{Leb128Iterator, Leb128Vec};
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
};
use self::assert::AssertValue;

View file

@@ -12,6 +12,7 @@ foundationdb = ["store/foundation"]
[dependencies]
store = { path = "../crates/store", features = ["test_mode"] }
nlp = { path = "../crates/nlp" }
directory = { path = "../crates/directory" }
jmap = { path = "../crates/jmap", features = ["test_mode"] }
jmap_proto = { path = "../crates/jmap-proto" }

View file

@@ -27,10 +27,11 @@ use std::{
};
use jmap_proto::types::keyword::Keyword;
use nlp::language::Language;
use store::{ahash::AHashMap, query::sort::Pagination};
use store::{
fts::{builder::FtsIndexBuilder, Language},
fts::builder::FtsIndexBuilder,
query::{Comparator, Filter},
write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE},
Store, ValueKey,