From 3d9efd363a3ed1fab306e3f5e9fea41bfe58d8be Mon Sep 17 00:00:00 2001 From: mdecimus Date: Tue, 10 Oct 2023 18:58:38 +0200 Subject: [PATCH] Bayes classifier, type tokenizer and NLP module reorganization --- CHANGELOG.md | 3 +- Cargo.lock | 116 +- Cargo.toml | 2 +- README.md | 1 + crates/antispam/Cargo.toml | 7 - crates/antispam/src/main.rs | 64 - .../src/modules/antispam}/mod.rs | 0 .../src/modules/antispam}/spamassassin.rs | 0 .../src/modules/antispam}/tokenizer.rs | 0 .../src/modules/antispam}/utils.rs | 0 crates/imap/Cargo.toml | 1 + crates/imap/src/op/search.rs | 3 +- crates/jmap/Cargo.toml | 1 + crates/jmap/src/api/config.rs | 6 +- crates/jmap/src/email/index.rs | 6 +- crates/jmap/src/email/query.rs | 3 +- crates/jmap/src/email/snippet.rs | 7 +- crates/jmap/src/lib.rs | 2 +- crates/jmap/src/mailbox/query.rs | 2 +- crates/jmap/src/sieve/query.rs | 6 +- crates/nlp/Cargo.toml | 19 + crates/nlp/src/bayes/bloom.rs | 77 + crates/nlp/src/bayes/classify.rs | 167 + crates/nlp/src/bayes/mod.rs | 75 + crates/nlp/src/bayes/train.rs | 68 + .../lang.rs => nlp/src/language/detect.rs} | 0 crates/nlp/src/language/mod.rs | 202 ++ .../src/fts => nlp/src/language}/stemmer.rs | 16 +- crates/nlp/src/lib.rs | 78 + crates/nlp/src/tokenizers/chinese.rs | 222 ++ crates/nlp/src/tokenizers/japanese.rs | 179 + crates/nlp/src/tokenizers/mod.rs | 74 + .../src/fts => nlp/src}/tokenizers/space.rs | 0 crates/nlp/src/tokenizers/types.rs | 2878 +++++++++++++++++ .../src/tokenizers/word.rs} | 16 +- .../ngram.rs => nlp/src/transformers/mod.rs} | 39 +- crates/nlp/src/transformers/osb.rs | 467 +++ crates/store/Cargo.toml | 7 +- crates/store/src/fts/bloom.rs | 7 +- crates/store/src/fts/builder.rs | 20 +- crates/store/src/fts/mod.rs | 154 +- crates/store/src/fts/query.rs | 8 +- crates/store/src/fts/search_snippet.rs | 10 +- crates/store/src/fts/term_index.rs | 22 +- crates/store/src/fts/tokenizers/chinese.rs | 197 -- crates/store/src/fts/tokenizers/japanese.rs | 168 - crates/store/src/fts/tokenizers/mod.rs | 96 - crates/store/src/fts/tokenizers/word.rs | 80 - crates/store/src/query/filter.rs | 6 +- crates/store/src/query/mod.rs | 5 +- crates/store/src/write/mod.rs | 4 +- tests/Cargo.toml | 1 + tests/src/store/query.rs | 3 +- 53 files changed, 4651 insertions(+), 944 deletions(-) delete mode 100644 crates/antispam/Cargo.toml delete mode 100644 crates/antispam/src/main.rs rename crates/{antispam/src/import => cli/src/modules/antispam}/mod.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/spamassassin.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/tokenizer.rs (100%) rename crates/{antispam/src/import => cli/src/modules/antispam}/utils.rs (100%) create mode 100644 crates/nlp/Cargo.toml create mode 100644 crates/nlp/src/bayes/bloom.rs create mode 100644 crates/nlp/src/bayes/classify.rs create mode 100644 crates/nlp/src/bayes/mod.rs create mode 100644 crates/nlp/src/bayes/train.rs rename crates/{store/src/fts/lang.rs => nlp/src/language/detect.rs} (100%) create mode 100644 crates/nlp/src/language/mod.rs rename crates/{store/src/fts => nlp/src/language}/stemmer.rs (93%) create mode 100644 crates/nlp/src/lib.rs create mode 100644 crates/nlp/src/tokenizers/chinese.rs create mode 100644 crates/nlp/src/tokenizers/japanese.rs create mode 100644 crates/nlp/src/tokenizers/mod.rs rename crates/{store/src/fts => nlp/src}/tokenizers/space.rs (100%) create mode 100644 crates/nlp/src/tokenizers/types.rs rename crates/{store/src/fts/tokenizers/indo_european.rs => nlp/src/tokenizers/word.rs} 
(94%) rename crates/{store/src/fts/ngram.rs => nlp/src/transformers/mod.rs} (53%) create mode 100644 crates/nlp/src/transformers/osb.rs delete mode 100644 crates/store/src/fts/tokenizers/chinese.rs delete mode 100644 crates/store/src/fts/tokenizers/japanese.rs delete mode 100644 crates/store/src/fts/tokenizers/mod.rs delete mode 100644 crates/store/src/fts/tokenizers/word.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 2453db6e..c20ad3de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. This projec ## [0.3.9] - 2023-10-07 ## Added -- Support for reading environment variables from configuration file using the `!ENV_VAR_NAME` special keyword. +- Support for reading environment variables from the configuration file using the `!ENV_VAR_NAME` special keyword. +- Option to disable ANSI color codes in logs. ### Changed - Querying directories from a Sieve script is now done using the `query()` method from `eval`. Your scripts will need to be updated, please refer to the [new syntax](https://stalw.art/docs/smtp/filter/sieve#directory-queries). diff --git a/Cargo.lock b/Cargo.lock index 4c57347e..f86c208d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,13 +169,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "antispam" -version = "0.1.0" -dependencies = [ - "fancy-regex", -] - [[package]] name = "anyhow" version = "1.0.75" @@ -1487,25 +1480,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" dependencies = [ - "errno-dragonfly", "libc", "windows-sys 0.48.0", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "etcetera" version = "0.8.0" @@ -2252,6 +2234,7 @@ dependencies = [ "mail-parser", "mail-send", "md5", + "nlp", "parking_lot", "rustls 0.21.7", "rustls-pemfile", @@ -2450,6 +2433,7 @@ dependencies = [ "mail-parser", "mail-send", "mime", + "nlp", "p256", "rand 0.8.5", "rasn", @@ -2510,9 +2494,9 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] @@ -2703,9 +2687,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lock_api" @@ -2754,7 +2738,7 @@ dependencies = [ "mail-parser", "parking_lot", "quick-xml 0.30.0", - "ring 0.17.2", + "ring 0.17.3", "rustls-pemfile", "serde", "serde_json", @@ -3001,6 +2985,30 @@ dependencies = [ "pin-utils", ] +[[package]] +name = "nlp" +version = "0.3.9" +dependencies = [ + "ahash 0.8.3", + "bincode", + "farmhash", + "jieba-rs", + "lazy_static", + "nohash", + "rust-stemmers", + 
"serde", + "siphasher 1.0.0", + "tinysegmenter", + "whatlang", + "xxhash-rust", +] + +[[package]] +name = "nohash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca" + [[package]] name = "nom" version = "7.1.3" @@ -3072,9 +3080,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", "libm", @@ -3476,7 +3484,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ - "siphasher", + "siphasher 0.3.11", ] [[package]] @@ -3485,7 +3493,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" dependencies = [ - "siphasher", + "siphasher 0.3.11", ] [[package]] @@ -3791,9 +3799,9 @@ dependencies = [ [[package]] name = "rasn" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf5174961dbfd4f03b57e71e5a11b034f564d5f0b133d63e39d703ac3d2876b" +checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2" dependencies = [ "arrayvec", "bitvec", @@ -3813,9 +3821,9 @@ dependencies = [ [[package]] name = "rasn-cms" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56517898cf38bb50fdb6479049ed476510bf59ae7d329b35129dc8a8b309697f" +checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5" dependencies = [ "rasn", "rasn-pkix", @@ -3823,9 +3831,9 @@ dependencies = [ [[package]] name = "rasn-derive" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8def4ce07f970be91bad36c3090af419dcd9e696897ada3cf74bd480e0101d61" +checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618" dependencies = [ "either", "itertools 0.10.5", @@ -3838,9 +3846,9 @@ dependencies = [ [[package]] name = "rasn-pkix" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebdeef45b70d4c20ce34725707b2784c761eacaaa4d841eab46f9f9c6dc10dd3" +checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8" dependencies = [ "rasn", ] @@ -4024,9 +4032,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.2" +version = "0.17.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "911b295d2d302948838c8ac142da1ee09fa7863163b44e6715bc9357905878b8" +checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e" dependencies = [ "cc", "getrandom 0.2.10", @@ -4198,9 +4206,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.17" +version = "0.38.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7" +checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c" dependencies = [ "bitflags 2.4.0", "errno", @@ -4644,6 +4652,12 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "siphasher" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" + [[package]] name = "slab" version = "0.4.9" @@ -5048,10 +5062,10 @@ dependencies = [ "farmhash", "foundationdb", "futures", - "jieba-rs", "lazy_static", "lru-cache", "maybe-async 0.2.7", + "nlp", "num_cpus", "parking_lot", "r2d2", @@ -5061,14 +5075,11 @@ dependencies = [ "rocksdb", "rusqlite", "rust-s3", - "rust-stemmers", "serde", - "siphasher", - "tinysegmenter", + "siphasher 1.0.0", "tokio", "tracing", "utils", - "whatlang", "xxhash-rust", ] @@ -5244,6 +5255,7 @@ dependencies = [ "mail-parser", "mail-send", "managesieve", + "nlp", "num_cpus", "rayon", "reqwest", @@ -5358,9 +5370,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.32.0" +version = "1.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" dependencies = [ "backtrace", "bytes", @@ -6040,12 +6052,12 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" +checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.3", + "untrusted 0.9.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b9dea6d6..36ca52e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,9 +8,9 @@ members = [ "crates/imap-proto", "crates/smtp", "crates/managesieve", + "crates/nlp", "crates/store", "crates/directory", - "crates/antispam", "crates/utils", "crates/maybe-async", "crates/cli", diff --git a/README.md b/README.md index 04bc9985..0574f2b9 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Key features: - OAuth 2.0 [authorization code](https://www.rfc-editor.org/rfc/rfc8628) and [device authorization](https://www.rfc-editor.org/rfc/rfc8628) flows. - Access Control Lists (ACLs). - Rate limiting. + - Security audited (read the [report](https://stalw.art/blog/security-audit)). - **Robust and scalable**: - **FoundationDB** or **SQLite** database backends. - **S3-compatible** blob storage support. 
diff --git a/crates/antispam/Cargo.toml b/crates/antispam/Cargo.toml deleted file mode 100644 index 9bad0181..00000000 --- a/crates/antispam/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "antispam" -version = "0.1.0" -edition = "2021" - -[dependencies] -fancy-regex = "0.11.0" diff --git a/crates/antispam/src/main.rs b/crates/antispam/src/main.rs deleted file mode 100644 index c83f7f01..00000000 --- a/crates/antispam/src/main.rs +++ /dev/null @@ -1,64 +0,0 @@ -use std::path::PathBuf; - -use import::spamassassin::import_spamassassin; - -pub mod import; - -fn main() { - import_spamassassin( - PathBuf::from("/Users/me/code/mail-server/resources/spamassassin"), - "cf".to_string(), - false, - ); -} - -const _IGNORE: &str = r#" - -[antispam] -required-score = 5 -add-headers = ["X-Spam-Checker-Version: SpamAssassin _VERSION_ (_SUBVERSION_) on _HOSTNAME_", - "X-Spam-Flag: _YESNOCAPS_", "X-Spam-Level: _STARS(*)_", - "X-Spam-Status: _YESNO_, score=_SCORE_ required=_REQD_ tests=_TESTS_ autolearn=_AUTOLEARN_ version=_VERSION_"] -originating-ip-headers = ["X-Yahoo-Post-IP", "X-Originating-IP", "X-Apparently-From", - "X-SenderIP X-AOL-IP", "X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp"] -rewrite-headers = ["Subject: [SPAM] _SUBJECT_"] -redirect-patterns = ["""m'/(?:index.php)?\?.*(?<=[?&])URL=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/url\?.*?(?<=[?&])q=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:site|inurl):(.*?)(?:$|%20|[\s+&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:"|%22)(.*?)(?:$|%22|["\s+&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/translate\?.*?(?<=[?&])u=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/pagead/iclk\?.*?(?<=[?&])adurl=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:\w+\.)?aol\.com/redir\.adp\?.*(?<=[?&])_url=(.*?)(?:$|[&\#])'i""", - """m'^https?/*(?:\w+\.)?facebook\.com/l/;(.*)'i""", - """/^http:\/\/chkpt\.zdnet\.com\/chkpt\/\w+\/(.*)$/i""", - """/^http:\/\/www(?:\d+)?\.nate\.com\/r\/\w+\/(.*)$/i""", - """/^http:\/\/.+\.gov\/(?:.*\/)?externalLink\.jhtml\?.*url=(.*?)(?:&.*)?$/i""", - """/^http:\/\/redir\.internet\.com\/.+?\/.+?\/(.*)$/i""", - """/^http:\/\/(?:.*?\.)?adtech\.de\/.*(?:;|\|)link=(.*?)(?:;|$)/i""", - """m'^http.*?/redirect\.php\?.*(?<=[?&])goto=(.*?)(?:$|[&\#])'i""", - """m'^https?:/*(?:[^/]+\.)?emf\d\.com/r\.cfm.*?&r=(.*)'i""" -] - -[antispam.autolearn] -enable = true -ignore-headers = [ "X-ACL-Warn", "X-Alimail-AntiSpam", "X-Amavis-Modified", "X-Anti*", "X-aol-global-disposition", - "X-ASF-*", "X-Assp-Version", "X-Authority-Analysis", "X-Authvirus", "X-Auto-Response-Suppress", "X-AV-Do-Run", - "X-AV-Status", "X-avast-antispam", "X-Backend", "X-Barracuda*", "X-Bayes*", "X-BitDefender*", "X-BL", "X-Bogosity", - "X-Boxtrapper", "X-Brightmail-Tracker", "X-BTI-AntiSpam", "X-Bugzilla-Version", "X-CanIt*", "X-Clapf-spamicity", - "X-Cloud-Security", "X-CM-Score", "X-CMAE-*", "X-Company", "X-Coremail-Antispam", "X-CRM114-*", "X-CT-Spam", - "X-CTCH-*", "X-Drweb-SpamState", "X-DSPAM*", "X-eavas*", "X-Enigmail-Version", "X-Eset*", "X-Exchange-Antispam-Report", - "X-ExtloopSabreCommercials1", "X-EYOU-SPAMVALUE", "X-FB-OUTBOUND-SPAM", "X-FEAS-SBL", "X-FILTER-SCORE", "X-Forefront*", - "X-Fuglu*", "X-getmail-filter-classifier", "X-GFIME-MASPAM", "X-Gmane-NNTP-Posting-Host", "X-GMX-Anti*", "X-He-Spam", - "X-hMailServer-Spam", "X-IAS", 
"X-iGspam-global", "X-Injected-Via-Gmane", "X-Interia-Antivirus", "X-IP-Spam-Verdict", - "X-Ironport*", "X-Junk*", "X-KLMS-*", "X-KMail-*", "X-MailCleaner-*", "X-MailFoundry", "X-MDMailLookup-Result", - "X-ME-*", "X-MessageFilter", "X-Microsoft-Antispam", "X-Mlf-Version", "X-MXScan-*", "X-NAI-Spam-*", "X-NetStation-Status", - "X-OVH-SPAM*", "X-PerlMx-*", "X-PFSI-Info", "X-PMX-*", "X-Policy-Service", "X-policyd-weight", "X-PreRBLs", - "X-Probable-Spam", "X-PROLinux-SpamCheck", "X-Proofpoint-*", "x-purgate-*", "X-Qmail-Scanner-*", "X-Quarantine-ID", - "X-RSpam-Report", "X-SA-*", "X-Scanned-by", "X-SmarterMail-CustomSpamHeader", "X-Spam*", "X-SPF-Scan-By", "X-STA-*", - "X-StarScan-Version", "X-SurGATE-Result", "X-SWITCHham-Score", "X-UI-*", "X-Univie*", "X-Virus*", "X-VR-*", - "X-WatchGuard*", "X-Whitelist-Domain", "X-WUM-CCI", "X_CMAE_Category" ] -threshold.ham = 0.1 -threshold.spam = 12.0 - - -"#; diff --git a/crates/antispam/src/import/mod.rs b/crates/cli/src/modules/antispam/mod.rs similarity index 100% rename from crates/antispam/src/import/mod.rs rename to crates/cli/src/modules/antispam/mod.rs diff --git a/crates/antispam/src/import/spamassassin.rs b/crates/cli/src/modules/antispam/spamassassin.rs similarity index 100% rename from crates/antispam/src/import/spamassassin.rs rename to crates/cli/src/modules/antispam/spamassassin.rs diff --git a/crates/antispam/src/import/tokenizer.rs b/crates/cli/src/modules/antispam/tokenizer.rs similarity index 100% rename from crates/antispam/src/import/tokenizer.rs rename to crates/cli/src/modules/antispam/tokenizer.rs diff --git a/crates/antispam/src/import/utils.rs b/crates/cli/src/modules/antispam/utils.rs similarity index 100% rename from crates/antispam/src/import/utils.rs rename to crates/cli/src/modules/antispam/utils.rs diff --git a/crates/imap/Cargo.toml b/crates/imap/Cargo.toml index 9ce4f25e..3e1a97f9 100644 --- a/crates/imap/Cargo.toml +++ b/crates/imap/Cargo.toml @@ -10,6 +10,7 @@ jmap = { path = "../jmap" } jmap_proto = { path = "../jmap-proto" } directory = { path = "../directory" } store = { path = "../store" } +nlp = { path = "../nlp" } utils = { path = "../utils" } mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "ludicrous_mode"] } mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] } diff --git a/crates/imap/src/op/search.rs b/crates/imap/src/op/search.rs index fa67d208..32ff976d 100644 --- a/crates/imap/src/op/search.rs +++ b/crates/imap/src/op/search.rs @@ -34,8 +34,9 @@ use imap_proto::{ use jmap_proto::types::{collection::Collection, id::Id, keyword::Keyword, property::Property}; use mail_parser::HeaderName; +use nlp::language::Language; use store::{ - fts::{builder::MAX_TOKEN_LENGTH, Language}, + fts::builder::MAX_TOKEN_LENGTH, query::{self, log::Query, sort::Pagination, ResultSet}, roaring::RoaringBitmap, write::now, diff --git a/crates/jmap/Cargo.toml b/crates/jmap/Cargo.toml index 1ea05664..fa1de123 100644 --- a/crates/jmap/Cargo.toml +++ b/crates/jmap/Cargo.toml @@ -6,6 +6,7 @@ resolver = "2" [dependencies] store = { path = "../store" } +nlp = { path = "../nlp" } jmap_proto = { path = "../jmap-proto" } smtp = { path = "../smtp" } utils = { path = "../utils" } diff --git a/crates/jmap/src/api/config.rs b/crates/jmap/src/api/config.rs index abed3120..d784a205 100644 --- a/crates/jmap/src/api/config.rs +++ b/crates/jmap/src/api/config.rs @@ -23,10 +23,8 @@ use std::{str::FromStr, time::Duration}; -use 
store::{
-    fts::Language,
-    rand::{distributions::Alphanumeric, thread_rng, Rng},
-};
+use nlp::language::Language;
+use store::rand::{distributions::Alphanumeric, thread_rng, Rng};
 
 use super::session::BaseCapabilities;
diff --git a/crates/jmap/src/email/index.rs b/crates/jmap/src/email/index.rs
index f5170f1d..fcb5f826 100644
--- a/crates/jmap/src/email/index.rs
+++ b/crates/jmap/src/email/index.rs
@@ -37,11 +37,9 @@ use mail_parser::{
     parsers::{fields::thread::thread_name, preview::preview_text},
     Addr, Address, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType,
 };
+use nlp::language::Language;
 use store::{
-    fts::{
-        builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
-        Language,
-    },
+    fts::builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
     write::{BatchBuilder, IntoOperations, F_BITMAP, F_CLEAR, F_INDEX, F_VALUE},
 };
diff --git a/crates/jmap/src/email/query.rs b/crates/jmap/src/email/query.rs
index ac3b57e4..96d74eb5 100644
--- a/crates/jmap/src/email/query.rs
+++ b/crates/jmap/src/email/query.rs
@@ -28,8 +28,9 @@ use jmap_proto::{
     types::{acl::Acl, collection::Collection, keyword::Keyword, property::Property},
 };
 use mail_parser::HeaderName;
+use nlp::language::Language;
 use store::{
-    fts::{builder::MAX_TOKEN_LENGTH, Language},
+    fts::builder::MAX_TOKEN_LENGTH,
     query::{self},
     roaring::RoaringBitmap,
     ValueKey,
diff --git a/crates/jmap/src/email/snippet.rs b/crates/jmap/src/email/snippet.rs
index 0931b82c..6beb2e7c 100644
--- a/crates/jmap/src/email/snippet.rs
+++ b/crates/jmap/src/email/snippet.rs
@@ -30,14 +30,12 @@ use jmap_proto::{
     types::{acl::Acl, collection::Collection},
 };
 use mail_parser::{decoders::html::html_to_text, MessageParser, PartType};
+use nlp::language::{stemmer::Stemmer, Language};
 use store::{
     fts::{
         builder::MAX_TOKEN_LENGTH,
         search_snippet::generate_snippet,
-        stemmer::Stemmer,
         term_index::{self, TermIndex},
-        tokenizers::Tokenizer,
-        Language,
     },
     BlobKind,
 };
@@ -66,7 +64,8 @@ impl JMAP {
                 || (text.starts_with('\'') && text.ends_with('\''))
             {
                 terms.push(
-                    Tokenizer::new(&text, language, MAX_TOKEN_LENGTH)
+                    language
+                        .tokenize_text(&text, MAX_TOKEN_LENGTH)
                         .map(|token| (token.word.into_owned(), None))
                         .collect::<Vec<_>>(),
                 );
diff --git a/crates/jmap/src/lib.rs b/crates/jmap/src/lib.rs
index ae6c3176..6ca93c86 100644
--- a/crates/jmap/src/lib.rs
+++ b/crates/jmap/src/lib.rs
@@ -40,6 +40,7 @@ use jmap_proto::{
     },
     types::{collection::Collection, property::Property},
 };
+use nlp::language::Language;
 use services::{
     delivery::spawn_delivery_manager,
     housekeeper::{self, init_housekeeper, spawn_housekeeper},
 };
 use smtp::core::SMTP;
 use store::{
-    fts::Language,
     parking_lot::Mutex,
     query::{sort::Pagination, Comparator, Filter, ResultSet, SortedResultSet},
     roaring::RoaringBitmap,
diff --git a/crates/jmap/src/mailbox/query.rs b/crates/jmap/src/mailbox/query.rs
index 84880059..9f1b81d2 100644
--- a/crates/jmap/src/mailbox/query.rs
+++ b/crates/jmap/src/mailbox/query.rs
@@ -27,9 +27,9 @@ use jmap_proto::{
     object::{mailbox::QueryArguments, Object},
     types::{acl::Acl, collection::Collection, property::Property, value::Value},
 };
+use nlp::language::Language;
 use store::{
     ahash::{AHashMap, AHashSet},
-    fts::Language,
     query::{self, sort::Pagination},
     roaring::RoaringBitmap,
 };
diff --git a/crates/jmap/src/sieve/query.rs b/crates/jmap/src/sieve/query.rs
index 033f0054..7f570160 100644
--- a/crates/jmap/src/sieve/query.rs
+++ b/crates/jmap/src/sieve/query.rs
@@ -28,10 +28,8 @@ use jmap_proto::{
     },
     types::{collection::Collection, property::Property},
 };
-use store::{
-    fts::Language,
-    query::{self},
-};
+use nlp::language::Language;
+use store::query::{self};
 
 use crate::JMAP;
diff --git a/crates/nlp/Cargo.toml b/crates/nlp/Cargo.toml
new file mode 100644
index 00000000..9db50841
--- /dev/null
+++ b/crates/nlp/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "nlp"
+version = "0.3.9"
+edition = "2021"
+resolver = "2"
+
+[dependencies]
+xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
+farmhash = "1.1.5"
+siphasher = "1.0"
+serde = { version = "1.0", features = ["derive"]}
+bincode = "1.3.3"
+nohash = "0.2.0"
+ahash = "0.8.3"
+lazy_static = "1.4"
+whatlang = "0.16" # Language detection
+rust-stemmers = "1.2" # Stemmers
+tinysegmenter = "0.1" # Japanese tokenizer
+jieba-rs = "0.6" # Chinese tokenizer (word segmentation)
diff --git a/crates/nlp/src/bayes/bloom.rs b/crates/nlp/src/bayes/bloom.rs
new file mode 100644
index 00000000..e701bcd6
--- /dev/null
+++ b/crates/nlp/src/bayes/bloom.rs
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use nohash::IsEnabled;
+
+use crate::transformers::osb::{Gram, OsbToken};
+
+use super::TokenHash;
+
+pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
+    buf: Vec<u8>,
+    tokens: T,
+}
+
+impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
+    pub fn new(tokens: T) -> Self {
+        Self {
+            buf: Vec::with_capacity(64),
+            tokens,
+        }
+    }
+}
+
+impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
+    type Item = OsbToken<TokenHash>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.tokens.next().map(|token| {
+            let bytes = match token.inner {
+                Gram::Uni { t1 } => t1.as_bytes(),
+                Gram::Bi { t1, t2, .. } => {
+                    self.buf.clear();
+                    self.buf.extend_from_slice(t1.as_bytes());
+                    self.buf.push(b' ');
+                    self.buf.extend_from_slice(t2.as_bytes());
+                    &self.buf
+                }
+            };
+
+            OsbToken {
+                inner: TokenHash {
+                    h1: xxhash_rust::xxh3::xxh3_64(bytes),
+                    h2: farmhash::hash64(bytes),
+                },
+                idx: token.idx,
+            }
+        })
+    }
+}
+
+impl std::hash::Hash for TokenHash {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        state.write_u64(self.h1 ^ self.h2);
+    }
+}
+
+impl IsEnabled for TokenHash {}
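For reference, the token-to-hash path in miniature (a sketch of the scheme above; xxh3_64 and farmhash::hash64 are the same calls BloomHasher makes, and the space-joined bigram layout matches its buffer handling):

// Sketch: how the bigram ("free", "offer") becomes a TokenHash.
fn hash_bigram(t1: &str, t2: &str) -> (u64, u64) {
    let mut buf = Vec::with_capacity(64);
    buf.extend_from_slice(t1.as_bytes());
    buf.push(b' ');
    buf.extend_from_slice(t2.as_bytes());
    // Two independent 64-bit hashes are kept side by side; the Hash impl
    // above folds them with XOR so the nohash-backed map can use them
    // directly as a pre-hashed key.
    (xxhash_rust::xxh3::xxh3_64(&buf), farmhash::hash64(&buf))
}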
diff --git a/crates/nlp/src/bayes/classify.rs b/crates/nlp/src/bayes/classify.rs
new file mode 100644
index 00000000..38f5da85
--- /dev/null
+++ b/crates/nlp/src/bayes/classify.rs
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use crate::transformers::osb::OsbToken;
+
+use super::{BayesClassifier, Weights};
+
+// Position 0 represents Unigram weights
+const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
+
+impl BayesClassifier {
+    pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
+    where
+        T: Iterator<Item = OsbToken<Weights>>,
+    {
+        if self.min_learns > 0 && (spam_learns < self.min_learns || ham_learns < self.min_learns) {
+            return None;
+        }
+
+        let mut processed_tokens = 0;
+        let mut total_spam_prob = 0.0;
+        let mut total_ham_prob = 0.0;
+
+        for token in tokens {
+            let weights = token.inner;
+            let total_count = weights.spam + weights.ham;
+
+            if total_count >= self.min_token_hits {
+                let total_count = total_count as f64;
+                let spam_freq = weights.spam as f64 / f64::max(1.0, spam_learns as f64);
+                let ham_freq = weights.ham as f64 / f64::max(1.0, ham_learns as f64);
+                let spam_prob = spam_freq / (spam_freq + ham_freq);
+                let ham_prob = ham_freq / (spam_freq + ham_freq);
+
+                let fw = FEATURE_WEIGHT[token.idx];
+                let w = (fw * total_count) / (1.0 + fw * total_count);
+                let bayes_spam_prob = prob_combine(spam_prob, total_count, w, 0.5);
+
+                if !((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + self.min_prob_strength)
+                    || (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - self.min_prob_strength))
+                {
+                    let bayes_ham_prob = prob_combine(ham_prob, total_count, w, 0.5);
+                    total_spam_prob += bayes_spam_prob.ln();
+                    total_ham_prob += bayes_ham_prob.ln();
+                    processed_tokens += 1;
+                }
+            }
+        }
+
+        if processed_tokens == 0
+            || self.min_tokens > 0 && processed_tokens < (self.min_tokens as f64 * 0.1) as u32
+        {
+            return None;
+        }
+
+        let (h, s) = if total_spam_prob > -300.0 && total_ham_prob > -300.0 {
+            /* Fisher value is low enough to apply inv_chi_square */
+            (
+                1.0 - inv_chi_square(total_spam_prob, processed_tokens),
+                1.0 - inv_chi_square(total_ham_prob, processed_tokens),
+            )
+        } else {
+            /* Use naive method */
+            if total_spam_prob < total_ham_prob {
+                let h = (1.0 - (total_spam_prob - total_ham_prob).exp())
+                    / (1.0 + (total_spam_prob - total_ham_prob).exp());
+                (h, 1.0 - h)
+            } else {
+                let s = (1.0 - (total_ham_prob - total_spam_prob).exp())
+                    / (1.0 + (total_ham_prob - total_spam_prob).exp());
+                (1.0 - s, s)
+            }
+        };
+
+        let final_prob = if h.is_finite() && s.is_finite() {
+            (s + 1.0 - h) / 2.0
+        } else {
+            /*
+             * We have some overflow, hence we need to check which class
+             * is NaN
+             */
+
+            if h.is_finite() {
+                1.0
+            } else if s.is_finite() {
+                0.0
+            } else {
+                0.5
+            }
+        };
+
+        if processed_tokens > 0 && (final_prob - 0.5).abs() > 0.05 {
+            Some(final_prob)
+        } else {
+            None
+        }
+    }
+}
+
+/**
+ * Returns the probability of chi-square > value for the specified number of
+ * degrees of freedom
+ */
+#[inline(always)]
+fn inv_chi_square(value: f64, freedom_deg: u32) -> f64 {
+    let mut prob = value.exp();
+
+    if prob.is_finite() {
+        /*
+         * m is our confidence in class
+         * prob is e ^ x (a small value, since x is normally less than zero).
+         * So we integrate over degrees of freedom and produce the total result
+         * from 1.0 (no confidence) to 0.0 (full confidence)
+         */
+
+        let mut sum = prob;
+        let m = -value;
+
+        for i in 1..freedom_deg {
+            prob *= m / i as f64;
+            sum += prob;
+        }
+
+        f64::min(1.0, sum)
+    } else {
+        /*
+         * e^x where x is a large *NEGATIVE* number is OK, so we have a very strong
+         * confidence that inv-chi-square is close to zero
+         */
+
+        if value < 0.0 {
+            0.0
+        } else {
+            1.0
+        }
+    }
+}
+
+/*#[inline(always)]
+fn normalize_probability(x: f64, bias: f64) -> f64 {
+    ((x - bias) * 2.0).powi(8)
+}*/
+
+#[inline(always)]
+fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
+    ((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))
+}
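A worked instance of prob_combine: for a unigram (feature weight fw = 1.0) seen total_count = 3 times with raw spam probability 0.9, the confidence weight is w = (1.0 * 3) / (1.0 + 1.0 * 3) = 0.75, so prob_combine(0.9, 3.0, 0.75, 0.5) = (0.75 * 0.5 + 3.0 * 0.9) / (0.75 + 3.0) = 0.82. Rarely-seen tokens are shrunk toward the 0.5 prior, while frequently-seen ones keep close to their observed probability.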
diff --git a/crates/nlp/src/bayes/mod.rs b/crates/nlp/src/bayes/mod.rs
new file mode 100644
index 00000000..3fb419d2
--- /dev/null
+++ b/crates/nlp/src/bayes/mod.rs
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::{collections::HashMap, hash::BuildHasherDefault};
+
+use nohash::NoHashHasher;
+use serde::{Deserialize, Serialize};
+
+pub mod bloom;
+pub mod classify;
+pub mod train;
+
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub struct BayesModel {
+    pub weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>>,
+    pub spam_learns: u32,
+    pub ham_learns: u32,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct BayesClassifier {
+    pub min_token_hits: u32,
+    pub min_tokens: u32,
+    pub min_prob_strength: f64,
+    pub min_learns: u32,
+}
+
+#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
+pub struct TokenHash {
+    h1: u64,
+    h2: u64,
+}
+
+#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
+pub struct Weights {
+    spam: u32,
+    ham: u32,
+}
+
+impl BayesClassifier {
+    pub fn new() -> Self {
+        BayesClassifier {
+            min_token_hits: 2,
+            min_tokens: 11,
+            min_prob_strength: 0.05,
+            min_learns: 200,
+        }
+    }
+}
+
+impl Default for BayesClassifier {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/crates/nlp/src/bayes/train.rs b/crates/nlp/src/bayes/train.rs
new file mode 100644
index 00000000..7ba0881d
--- /dev/null
+++ b/crates/nlp/src/bayes/train.rs
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use crate::transformers::osb::OsbToken;
+
+use super::{BayesModel, TokenHash};
+
+impl BayesModel {
+    pub fn train<T>(&mut self, tokens: T, is_spam: bool)
+    where
+        T: IntoIterator<Item = OsbToken<TokenHash>>,
+    {
+        if is_spam {
+            self.spam_learns += 1;
+        } else {
+            self.ham_learns += 1;
+        }
+
+        for token in tokens {
+            let hs = self.weights.entry(token.inner).or_default();
+            if is_spam {
+                hs.spam += 1;
+            } else {
+                hs.ham += 1;
+            }
+        }
+    }
+
+    pub fn untrain<T>(&mut self, tokens: T, is_spam: bool)
+    where
+        T: IntoIterator<Item = OsbToken<TokenHash>>,
+    {
+        if is_spam {
+            self.spam_learns -= 1;
+        } else {
+            self.ham_learns -= 1;
+        }
+
+        for token in tokens {
+            let hs = self.weights.entry(token.inner).or_default();
+            if is_spam {
+                hs.spam -= 1;
+            } else {
+                hs.ham -= 1;
+            }
+        }
+    }
+}
diff --git a/crates/store/src/fts/lang.rs b/crates/nlp/src/language/detect.rs
similarity index 100%
rename from crates/store/src/fts/lang.rs
rename to crates/nlp/src/language/detect.rs
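Putting the two halves together, a minimal sketch of the intended train/classify flow, mirroring the test module in crates/nlp/src/lib.rs further down in this patch (corpus strings are illustrative):

use nlp::bayes::{bloom::BloomHasher, BayesClassifier, BayesModel};
use nlp::transformers::osb::{OsbToken, OsbTokenizer};

fn sketch() {
    let mut model = BayesModel::default();
    // One message per class; a real model needs min_learns (200) of each.
    model.train(
        BloomHasher::new(OsbTokenizer::new("buy cheap pills now".split_ascii_whitespace(), 5)),
        true,
    );
    model.train(
        BloomHasher::new(OsbTokenizer::new("meeting notes attached".split_ascii_whitespace(), 5)),
        false,
    );

    // Classification maps each hashed token to its stored Weights first.
    let classifier = BayesClassifier::new();
    let verdict = classifier.classify(
        BloomHasher::new(OsbTokenizer::new("cheap pills".split_ascii_whitespace(), 5))
            .filter_map(|t| model.weights.get(&t.inner).map(|w| OsbToken { inner: *w, idx: t.idx })),
        model.ham_learns,
        model.spam_learns,
    );
    // None here: with only one learn per class, the min_learns guard declines to score.
    assert!(verdict.is_none());
}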
diff --git a/crates/nlp/src/language/mod.rs b/crates/nlp/src/language/mod.rs
new file mode 100644
index 00000000..edc87368
--- /dev/null
+++ b/crates/nlp/src/language/mod.rs
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::borrow::Cow;
+
+use crate::tokenizers::{
+    chinese::ChineseTokenizer, japanese::JapaneseTokenizer, word::WordTokenizer, Token,
+};
+
+use self::detect::LanguageDetector;
+
+pub mod detect;
+pub mod stemmer;
+
+pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
+
+impl Language {
+    pub fn tokenize_text<'x>(
+        &self,
+        text: &'x str,
+        max_token_length: usize,
+    ) -> LanguageTokenizer<'x> {
+        match self {
+            Language::Japanese => Box::new(
+                JapaneseTokenizer::new(WordTokenizer::new(text, usize::MAX))
+                    .filter(move |t| t.word.len() <= max_token_length),
+            ),
+            Language::Mandarin => Box::new(
+                ChineseTokenizer::new(WordTokenizer::new(text, usize::MAX))
+                    .filter(move |t| t.word.len() <= max_token_length),
+            ),
+            _ => Box::new(WordTokenizer::new(text, max_token_length)),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
+pub enum Language {
+    Esperanto = 0,
+    English = 1,
+    Russian = 2,
+    Mandarin = 3,
+    Spanish = 4,
+    Portuguese = 5,
+    Italian = 6,
+    Bengali = 7,
+    French = 8,
+    German = 9,
+    Ukrainian = 10,
+    Georgian = 11,
+    Arabic = 12,
+    Hindi = 13,
+    Japanese = 14,
+    Hebrew = 15,
+    Yiddish = 16,
+    Polish = 17,
+    Amharic = 18,
+    Javanese = 19,
+    Korean = 20,
+    Bokmal = 21,
+    Danish = 22,
+    Swedish = 23,
+    Finnish = 24,
+    Turkish = 25,
+    Dutch = 26,
+    Hungarian = 27,
+    Czech = 28,
+    Greek = 29,
+    Bulgarian = 30,
+    Belarusian = 31,
+    Marathi = 32,
+    Kannada = 33,
+    Romanian = 34,
+    Slovene = 35,
+    Croatian = 36,
+    Serbian = 37,
+    Macedonian = 38,
+    Lithuanian = 39,
+    Latvian = 40,
+    Estonian = 41,
+    Tamil = 42,
+    Vietnamese = 43,
+    Urdu = 44,
+    Thai = 45,
+    Gujarati = 46,
+    Uzbek = 47,
+    Punjabi = 48,
+    Azerbaijani = 49,
+    Indonesian = 50,
+    Telugu = 51,
+    Persian = 52,
+    Malayalam = 53,
+    Oriya = 54,
+    Burmese = 55,
+    Nepali = 56,
+    Sinhalese = 57,
+    Khmer = 58,
+    Turkmen = 59,
+    Akan = 60,
+    Zulu = 61,
+    Shona = 62,
+    Afrikaans = 63,
+    Latin = 64,
+    Slovak = 65,
+    Catalan = 66,
+    Tagalog = 67,
+    Armenian = 68,
+    Unknown = 69,
+    None = 70,
+}
+
+impl Language {
+    pub fn from_iso_639(code: &str) -> Option<Self> {
+        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
+            "en" => Language::English,
+            "es" => Language::Spanish,
+            "pt" => Language::Portuguese,
+            "it" => Language::Italian,
+            "fr" => Language::French,
+            "de" => Language::German,
+            "ru" => Language::Russian,
+            "zh" => Language::Mandarin,
+            "ja" => Language::Japanese,
+            "ar" => Language::Arabic,
+            "hi" => Language::Hindi,
+            "ko" => Language::Korean,
+            "bn" => Language::Bengali,
+            "he" => Language::Hebrew,
+            "ur" => Language::Urdu,
+            "fa" => Language::Persian,
+            "ml" => Language::Malayalam,
+            "or" => Language::Oriya,
+            "my" => Language::Burmese,
+            "ne" => Language::Nepali,
+            "si" => Language::Sinhalese,
+            "km" => Language::Khmer,
+            "tk" => Language::Turkmen,
+            "am" => Language::Amharic,
+            "az" => Language::Azerbaijani,
+            "id" => Language::Indonesian,
+            "te" => Language::Telugu,
+            "ta" => Language::Tamil,
+            "vi" => Language::Vietnamese,
+            "gu" => Language::Gujarati,
+            "pa" => Language::Punjabi,
+            "uz" => Language::Uzbek,
+            "hy" => Language::Armenian,
+            "ka" => Language::Georgian,
+            "la" => Language::Latin,
+            "sl" => Language::Slovene,
+            "hr" => Language::Croatian,
+            "sr" => Language::Serbian,
+            "mk" => Language::Macedonian,
+            "lt" => Language::Lithuanian,
+            "lv" => Language::Latvian,
+            "et" => Language::Estonian,
+            "tl" => Language::Tagalog,
+            "af" => Language::Afrikaans,
+            "zu" => Language::Zulu,
+            "sn" => Language::Shona,
+            "ak" => Language::Akan,
+            _ => return None,
+        }
+        .into()
+    }
+}
+
+impl Language {
+    pub fn detect(text: String, default: Language) -> (String, Language) {
+        if let Some((l, t)) = text
+            .split_once(':')
+            .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
+        {
+            (t.to_string(), l)
+        } else {
+            let l = LanguageDetector::detect_single(&text)
+                .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
+                .unwrap_or(default);
+            (text, l)
+        }
+    }
+}
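A quick sketch of the language API above (a sketch, assuming the word tokenizer splits on whitespace and punctuation, as its tests elsewhere in this patch suggest):

use nlp::language::Language;

fn sketch() {
    // ISO-639 lookup tolerates a region subtag.
    assert_eq!(Language::from_iso_639("en-US"), Some(Language::English));

    // A "lang:" prefix wins over statistical detection.
    let (_text, lang) = Language::detect("fr:bonjour à tous".to_string(), Language::English);
    assert_eq!(lang, Language::French);

    // Tokenization dispatches to a script-specific tokenizer per language.
    let words: Vec<_> = Language::English
        .tokenize_text("Hello world", 40)
        .map(|t| t.word)
        .collect();
    assert_eq!(words.len(), 2);
}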
diff --git a/crates/store/src/fts/stemmer.rs b/crates/nlp/src/language/stemmer.rs
similarity index 93%
rename from crates/store/src/fts/stemmer.rs
rename to crates/nlp/src/language/stemmer.rs
index aa056d22..cd3da5e2 100644
--- a/crates/store/src/fts/stemmer.rs
+++ b/crates/nlp/src/language/stemmer.rs
@@ -25,25 +25,25 @@
 use std::borrow::Cow;
 
 use rust_stemmers::Algorithm;
 
-use super::{tokenizers::Tokenizer, Language};
+use super::{Language, LanguageTokenizer};
 
 #[derive(Debug, PartialEq, Eq)]
 pub struct StemmedToken<'x> {
     pub word: Cow<'x, str>,
     pub stemmed_word: Option<Cow<'x, str>>,
-    pub offset: u32, // Word offset in the text part
-    pub len: u8,     // Word length
+    pub from: usize, // Start offset of the word in the text part
+    pub to: usize,   // End offset of the word in the text part
 }
 
 pub struct Stemmer<'x> {
     stemmer: Option<rust_stemmers::Stemmer>,
-    tokenizer: Tokenizer<'x>,
+    tokenizer: LanguageTokenizer<'x>,
 }
 
 impl<'x> Stemmer<'x> {
     pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
         Stemmer {
-            tokenizer: Tokenizer::new(text, language, max_token_length),
+            tokenizer: language.tokenize_text(text, max_token_length),
             stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
         }
     }
@@ -57,15 +57,15 @@ impl<'x> Iterator for Stemmer<'x> {
         Some(StemmedToken {
             stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
                 match stemmer.stem(&token.word) {
-                    Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
+                    Cow::Owned(text) if text.len() != token.word.len() || text != token.word => {
                         Some(text.into())
                     }
                     _ => None,
                 }
             }),
             word: token.word,
-            offset: token.offset,
-            len: token.len,
+            from: token.from,
+            to: token.to,
        })
    }
 }
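The relocated stemmer keeps its old behavior behind the new tokenizer plumbing; a usage sketch (stemmed_word is None whenever stemming leaves the word unchanged):

use nlp::language::{stemmer::Stemmer, Language};

fn sketch() {
    for token in Stemmer::new("dogs running", Language::English, 40) {
        // Expected pairs: ("dogs", Some("dog")), ("running", Some("run")),
        // with from/to holding byte offsets into the input text.
        println!("{} -> {:?} [{}..{}]", token.word, token.stemmed_word, token.from, token.to);
    }
}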
diff --git a/crates/nlp/src/lib.rs b/crates/nlp/src/lib.rs
new file mode 100644
index 00000000..d933ea0e
--- /dev/null
+++ b/crates/nlp/src/lib.rs
@@ -0,0 +1,78 @@
+use ahash::AHashSet;
+
+pub mod bayes;
+pub mod language;
+pub mod tokenizers;
+pub mod transformers;
+
+#[derive(Debug, Clone, Default)]
+pub struct PublicSuffix {
+    pub suffixes: AHashSet<String>,
+    pub exceptions: AHashSet<String>,
+    pub wildcards: Vec<String>,
+}
+
+impl PublicSuffix {
+    pub fn contains(&self, suffix: &str) -> bool {
+        self.suffixes.contains(suffix)
+            || (!self.exceptions.contains(suffix)
+                && self.wildcards.iter().any(|w| suffix.ends_with(w)))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::fs;
+
+    use crate::{
+        bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
+        transformers::osb::{OsbToken, OsbTokenizer},
+    };
+
+    #[test]
+    #[ignore]
+    fn train() {
+        let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
+        let mut bayes = BayesModel::default();
+
+        for line in db.lines() {
+            let (text, is_spam) = line.rsplit_once(',').unwrap();
+            let is_spam = is_spam == "1";
+
+            bayes.train(
+                BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
+                is_spam,
+            );
+        }
+        println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
+        fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
+    }
+
+    #[test]
+    #[ignore]
+    fn classify() {
+        let model: BayesModel =
+            bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
+        let bayes = BayesClassifier::new();
+
+        for text in [
+            "i am attaching to this email a presentation to integrate the spreadsheet into our server",
+            "buy this great product special offer sales",
+            "i m using simple dns from jhsoft we support only a few web sites and i d like to swap secondary services with someone in a similar position",
+            "viagra xenical vioxx zyban propecia we only offer the real viagra xenical ",
+        ] {
+            println!(
+                "{:?} -> {}",
+                text,
+                bayes
+                    .classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
+                        OsbToken {
+                            idx: x.idx,
+                            inner: *w,
+                        }
+                    })), model.ham_learns, model.spam_learns)
+                    .unwrap()
+            );
+        }
+    }
+}
diff --git a/crates/nlp/src/tokenizers/chinese.rs b/crates/nlp/src/tokenizers/chinese.rs
new file mode 100644
index 00000000..f9ff355b
--- /dev/null
+++ b/crates/nlp/src/tokenizers/chinese.rs
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2023, Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::{borrow::Cow, vec::IntoIter};
+
+use jieba_rs::Jieba;
+
+use super::{InnerToken, Token};
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref JIEBA: Jieba = Jieba::new();
+}
+
+pub struct ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    tokenizer: T,
+    tokens: IntoIter<Token<I>>,
+    phantom: std::marker::PhantomData<&'x str>,
+}
+
+impl<'x, T, I> ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    pub fn new(tokenizer: T) -> Self {
+        ChineseTokenizer {
+            tokenizer,
+            tokens: Vec::new().into_iter(),
+            phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    type Item = Token<I>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(token) = self.tokens.next() {
+                return Some(token);
+            } else {
+                let token = self.tokenizer.next()?;
+                if token.word.is_alphabetic_8bit() {
+                    let mut token_to = token.from;
+                    match token.word.unwrap_alphabetic() {
+                        Cow::Borrowed(word) => {
+                            self.tokens = JIEBA
+                                .cut(word, false)
+                                .into_iter()
+                                .map(|word| {
+                                    let token_from = token_to;
+                                    token_to += word.len();
+                                    Token {
+                                        word: I::new_alphabetic(word),
+                                        from: token_from,
+                                        to: token_to,
+                                    }
+                                })
+                                .collect::<Vec<_>>()
+                                .into_iter();
+                        }
+                        Cow::Owned(word) => {
+                            self.tokens = JIEBA
+                                .cut(&word, false)
+                                .into_iter()
+                                .map(|word| {
+                                    let token_from = token_to;
+                                    token_to += word.len();
+                                    Token {
+                                        word: I::new_alphabetic(word.to_string()),
+                                        from: token_from,
+                                        to: token_to,
+                                    }
+                                })
+                                .collect::<Vec<_>>()
+                                .into_iter();
+                        }
+                    }
+                } else {
+                    return token.into();
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
+
+    #[test]
+    fn chinese_tokenizer() {
+        assert_eq!(
+            ChineseTokenizer::new(WordTokenizer::new(
+                "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
+                40
+            ),)
+            .collect::<Vec<_>>(),
+            vec![
+                Token {
+                    word: "孫".into(),
+                    from: 0,
+                    to: 3
+                },
+                Token {
+                    word: "子".into(),
+                    from: 3,
+                    to: 6
+                },
+                Token {
+                    word: "曰".into(),
+                    from: 6,
+                    to: 9
+                },
+                Token {
+                    word: "兵".into(),
+                    from: 12,
+                    to: 15
+                },
+                Token {
+                    word: "者".into(),
+                    from: 15,
+                    to: 18
+                },
+                Token {
+                    word: "國".into(),
+                    from: 21,
+                    to: 24
+                },
+                Token {
+                    word: "之".into(),
+                    from: 24,
+                    to: 27
+                },
+                Token {
+                    word: "大事".into(),
+                    from: 27,
+                    to: 33
+                },
+                Token {
+                    word: "死".into(),
+                    from: 36,
+                    to: 39
+                },
+                Token {
+                    word: "生".into(),
+                    from: 39,
+                    to: 42
+                },
+                Token {
+                    word: "之".into(),
+                    from: 42,
+                    to: 45
+                },
+                Token {
+                    word: "地".into(),
+                    from: 45,
+                    to: 48
+                },
+                Token {
+                    word: "存亡".into(),
+                    from: 51,
+                    to: 57
+                },
+                Token {
+                    word: "之".into(),
+                    from: 57,
+                    to: 60
+                },
+                Token {
+                    word: "道".into(),
+                    from: 60,
+                    to: 63
+                },
+                Token {
+                    word: "不可不".into(),
+                    from: 66,
+                    to: 75
+                },
+                Token {
+                    word: "察".into(),
+                    from: 75,
+                    to: 78
+                },
+                Token {
+                    word: "也".into(),
+                    from: 78,
+                    to: 81
+                }
+            ]
+        );
+    }
+}
diff --git a/crates/nlp/src/tokenizers/japanese.rs b/crates/nlp/src/tokenizers/japanese.rs
new file mode 100644
index 00000000..d0762c86
--- /dev/null
+++ b/crates/nlp/src/tokenizers/japanese.rs
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2023, Stalwart Labs Ltd.
+ *
+ * This file is part of Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::vec::IntoIter;
+
+use super::{InnerToken, Token};
+
+pub struct JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    tokenizer: T,
+    tokens: IntoIter<Token<I>>,
+    phantom: std::marker::PhantomData<&'x str>,
+}
+
+impl<'x, T, I> JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    pub fn new(tokenizer: T) -> Self {
+        JapaneseTokenizer {
+            tokenizer,
+            tokens: Vec::new().into_iter(),
+            phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<'x, T, I> Iterator for JapaneseTokenizer<'x, T, I>
+where
+    T: Iterator<Item = Token<I>>,
+    I: InnerToken<'x>,
+{
+    type Item = Token<I>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(token) = self.tokens.next() {
+                return Some(token);
+            } else {
+                let token = self.tokenizer.next()?;
+                if token.word.is_alphabetic_8bit() {
+                    let mut token_to = token.from;
+                    self.tokens = tinysegmenter::tokenize(token.word.unwrap_alphabetic().as_ref())
+                        .into_iter()
+                        .map(|word| {
+                            let token_from = token_to;
+                            token_to += word.len();
+                            Token {
+                                word: I::new_alphabetic(word.to_string()),
+                                from: token_from,
+                                to: token_to,
+                            }
+                        })
+                        .collect::<Vec<_>>()
+                        .into_iter();
+                } else {
+                    return token.into();
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tokenizers::{japanese::JapaneseTokenizer, word::WordTokenizer, Token};
+
+    #[test]
+    fn japanese_tokenizer() {
+        assert_eq!(
+            JapaneseTokenizer::new(WordTokenizer::new(
+                "お先に失礼します あなたの名前は何ですか 123 abc-872",
+                40
+            ))
+            .collect::<Vec<_>>(),
+            vec![
+                Token {
+                    word: "お先".into(),
+                    from: 0,
+                    to: 6
+                },
+                Token {
+                    word: "に".into(),
+                    from: 6,
+                    to: 9
+                },
+                Token {
+                    word: "失礼".into(),
+                    from: 9,
+                    to: 15
+                },
+                Token {
+                    word: "し".into(),
+                    from: 15,
+                    to: 18
+                },
+                Token {
+                    word: "ます".into(),
+                    from: 18,
+                    to: 24
+                },
+                Token {
+                    word: "あなた".into(),
+                    from: 25,
+                    to: 34
+                },
+                Token {
+                    word: "の".into(),
+                    from: 34,
+                    to: 37
+                },
+                Token {
+                    word: "名前".into(),
+                    from: 37,
+                    to: 43
+                },
+                Token {
+                    word: "は".into(),
+                    from: 43,
+                    to: 46
+                },
+                Token {
+                    word: "何".into(),
+                    from: 46,
+                    to: 49
+                },
+                Token {
+                    word: "です".into(),
+                    from: 49,
+                    to: 55
+                },
+                Token {
+                    word: "か".into(),
+                    from: 55,
+                    to: 58
+                },
+                Token {
+                    word: "123".into(),
+                    from: 59,
+                    to: 62
+                },
+                Token {
+                    word: "abc".into(),
+                    from: 63,
+                    to: 66
+                },
+                Token {
+                    word: "872".into(),
+                    from: 67,
+                    to: 70
+                }
+            ]
+        );
+    }
+}
diff --git a/crates/nlp/src/tokenizers/mod.rs b/crates/nlp/src/tokenizers/mod.rs
new file mode 100644
index 00000000..a3e42d47
--- /dev/null
+++ b/crates/nlp/src/tokenizers/mod.rs
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+pub mod chinese;
+pub mod japanese;
+pub mod space;
+pub mod types;
+pub mod word;
+
+use std::borrow::Cow;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token<T> {
+    pub word: T,
+    pub from: usize,
+    pub to: usize,
+}
+
+pub trait InnerToken<'x>: Sized {
+    fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self;
+    fn unwrap_alphabetic(self) -> Cow<'x, str>;
+    fn is_alphabetic(&self) -> bool;
+    fn is_alphabetic_8bit(&self) -> bool;
+}
+
+impl<'x> InnerToken<'x> for Cow<'x, str> {
+    fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self {
+        value.into()
+    }
+
+    fn is_alphabetic(&self) -> bool {
+        true
+    }
+
+    fn is_alphabetic_8bit(&self) -> bool {
+        !self.chars().all(|c| c.is_ascii())
+    }
+
+    fn unwrap_alphabetic(self) -> Cow<'x, str> {
+        self
+    }
+}
+
+impl<T> Token<T> {
+    pub fn new(offset: usize, len: usize, word: T) -> Token<T> {
+        debug_assert!(offset <= u32::max_value() as usize);
+        debug_assert!(len <= u8::max_value() as usize);
+        Token {
+            from: offset,
+            to: offset + len,
+            word,
+        }
+    }
+}
diff --git a/crates/store/src/fts/tokenizers/space.rs b/crates/nlp/src/tokenizers/space.rs
similarity index 100%
rename from crates/store/src/fts/tokenizers/space.rs
rename to crates/nlp/src/tokenizers/space.rs
diff --git a/crates/nlp/src/tokenizers/types.rs b/crates/nlp/src/tokenizers/types.rs
new file mode 100644
index 00000000..97e9bccb
--- /dev/null
+++ b/crates/nlp/src/tokenizers/types.rs
@@ -0,0 +1,2878 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
+
+use std::str::CharIndices;
+
+use crate::PublicSuffix;
+
+use super::Token;
+
+pub struct TypesTokenizer<'x, 'y> {
+    text: &'x str,
+    suffixes: &'y PublicSuffix,
+    iter: CharIndices<'x>,
+    tokens: Vec<Token<TokenType<'x>>>,
+    peek_pos: usize,
+    last_ch_is_space: bool,
+    last_token_is_dot: bool,
+    eof: bool,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType<'x> {
+    Alphabetic(&'x str),
+    Integer(&'x str),
+    Alphanumeric(&'x str),
+    Hexadecimal(&'x str),
+    Other(char),
+    Punctuation(char),
+    Space,
+
+    // Detected types
+    Url(&'x str),
+    UrlNoScheme(&'x str),
+    UrlNoHost(&'x str),
+    Email(&'x str),
+    Float(&'x str),
+}
+
+impl Copy for Token<TokenType<'_>> {}
+
+impl<'x, 'y> Iterator for TypesTokenizer<'x, 'y> {
+    type Item = Token<TokenType<'x>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let token = self.peek()?;
+        let last_is_dot = self.last_token_is_dot;
+        self.last_token_is_dot = matches!(token.word, TokenType::Punctuation('.'));
+
+        // Try parsing URL with scheme
+        if matches!(
+            token.word,
+            TokenType::Alphabetic(t) | TokenType::Hexadecimal(t)
+                if t.len() <= 8 && t.chars().all(|c| c.is_ascii()))
+            && self.try_skip_url_scheme()
+        {
+            if let Some(url) = self.try_parse_url(token.into()) {
+                self.peek_advance();
+                return Some(url);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing email
+        if token.word.is_email_atom()
+            && self.peek_has_tokens(
+                &[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
+                TokenType::Space,
+            )
+        {
+            if let Some(email) = self.try_parse_email() {
+                self.peek_advance();
+                return Some(email);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing URL without scheme
+        if token.word.is_domain_atom(true)
+            && self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
+        {
+            if let Some(url) = self.try_parse_url(None) {
+                self.peek_advance();
+                return Some(url);
+            } else {
+                self.peek_rewind();
+            }
+        }
+
+        // Try parsing currencies and floating point numbers
+        if !last_is_dot {
+            if let Some(num) = self.try_parse_number() {
+                self.peek_advance();
+                return Some(num);
+            }
+        }
+
+        self.peek_rewind();
+        self.next_()
+    }
+}
+
+impl<'x, 'y> TypesTokenizer<'x, 'y> {
+    pub fn new(text: &'x str, suffixes: &'y PublicSuffix) -> Self {
+        Self {
+            text,
+            iter: text.char_indices(),
+            tokens: Vec::new(),
+            eof: false,
+            peek_pos: 0,
+            suffixes,
+            last_ch_is_space: false,
+            last_token_is_dot: false,
+        }
+    }
+
+    fn consume(&mut self) -> bool {
+        let mut has_alpha = false;
+        let mut has_number = false;
+        let mut has_hex = false;
+
+        let mut start_pos = usize::MAX;
+        let mut end_pos = usize::MAX;
+
+        let mut stop_char = None;
+
+        for (pos, ch) in self.iter.by_ref() {
+            if ch.is_alphabetic() {
+                if ch.is_ascii_hexdigit() {
+                    has_hex = true;
+                } else {
+                    has_alpha = true;
+                }
+            } else if ch.is_ascii_digit() {
+                has_number = true;
+            } else {
+                let last_was_space = self.last_ch_is_space;
+                self.last_ch_is_space = ch.is_ascii_whitespace();
+                stop_char = Token {
+                    word: if self.last_ch_is_space {
+                        if last_was_space {
+                            continue;
+                        } else {
+                            TokenType::Space
+                        }
+                    } else if ch.is_ascii() {
+                        TokenType::Punctuation(ch)
+                    } else {
+                        TokenType::Other(ch)
+                    },
+                    from: pos,
+                    to: pos + ch.len_utf8(),
+                }
+                .into();
+                break;
+            }
+            self.last_ch_is_space = false;
+
+            if start_pos == usize::MAX {
+                start_pos = pos;
+            }
+            end_pos = pos + ch.len_utf8();
+        }
+
+        if start_pos != usize::MAX {
+            let text = &self.text[start_pos..end_pos];
+
+            self.tokens.push(Token {
+                word: if has_alpha && has_number {
+                    TokenType::Alphanumeric(text)
+                } else if has_alpha {
+                    TokenType::Alphabetic(text)
+                } else if has_hex {
+ fn next_(&mut self) -> Option<Token<TokenType<'x>>> {
+ if self.tokens.is_empty() && !self.eof {
+ self.consume();
+ }
+ if !self.tokens.is_empty() {
+ Some(self.tokens.remove(0))
+ } else {
+ None
+ }
+ }
+
+ fn peek(&mut self) -> Option<Token<TokenType<'x>>> {
+ while self.tokens.len() <= self.peek_pos && !self.eof {
+ self.consume();
+ }
+ self.tokens.get(self.peek_pos).map(|t| {
+ self.peek_pos += 1;
+ *t
+ })
+ }
+
+ fn peek_advance(&mut self) {
+ if self.peek_pos > 0 {
+ self.tokens.drain(..self.peek_pos);
+ self.peek_pos = 0;
+ }
+ }
+
+ fn peek_rewind(&mut self) {
+ self.peek_pos = 0;
+ }
+
+ fn peek_has_tokens(&mut self, tokens: &[TokenType<'_>], stop_token: TokenType<'_>) -> bool {
+ let mut tokens = tokens.iter().copied();
+ let mut token = tokens.next().unwrap();
+ while let Some(t) = self.peek() {
+ if t.word == token {
+ if let Some(next_token) = tokens.next() {
+ token = next_token;
+ } else {
+ self.peek_rewind();
+ return true;
+ }
+ } else if t.word == stop_token {
+ break;
+ }
+ }
+
+ self.peek_rewind();
+ false
+ }
+
+ fn try_parse_url(
+ &mut self,
+ scheme_token: Option<Token<TokenType<'x>>>,
+ ) -> Option<Token<TokenType<'x>>> {
+ let (has_scheme, allow_blank_host) = scheme_token.as_ref().map_or((false, false), |t| {
+ (
+ true,
+ matches!(t.word, TokenType::Alphabetic(s) if s.eq_ignore_ascii_case("file")),
+ )
+ });
+ if has_scheme {
+ let restore_pos = self.peek_pos;
+ let mut has_user_info = false;
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('@') => {
+ has_user_info = true;
+ break;
+ }
+ TokenType::Alphabetic(_)
+ | TokenType::Alphanumeric(_)
+ | TokenType::Integer(_)
+ | TokenType::Hexadecimal(_)
+ | TokenType::Punctuation(
+ '-' | '.' | '_' | '~' | '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+'
+ | ',' | ';' | '=' | ':',
+ ) => (),
+ _ => break,
+ }
+ }
+
+ if !has_user_info {
+ self.peek_pos = restore_pos;
+ }
+ }
+
+ // Try parsing hostname
+ let mut is_valid_host = true;
+ let (host_start_pos, mut end_pos) = if has_scheme {
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ let mut text_count = 0;
+ let mut int_count = 0;
+ let mut dot_count = 0;
+ let mut is_ipv6 = false;
+
+ let mut last_label_is_tld = false;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Alphabetic(text)
+ | TokenType::Alphanumeric(text)
+ | TokenType::Hexadecimal(text) => {
+ last_label_is_tld =
+ text.len() >= 2 && self.suffixes.contains(&text.to_ascii_lowercase());
+ text_count += 1;
+ }
+ TokenType::Integer(text) => {
+ if text.len() <= 3 {
+ int_count += 1;
+ }
+ }
+ TokenType::Punctuation('.') => {
+ dot_count += 1;
+ continue;
+ }
+ TokenType::Punctuation('[') if start_pos == usize::MAX => {
+ let (_, to) = self.try_parse_ipv6(token.from)?;
+ start_pos = token.from;
+ end_pos = to;
+ restore_pos = self.peek_pos;
+ is_ipv6 = true;
+ break;
+ }
+ // these sub-delims are tolerated inside the host section, but
+ // the arm below `continue`s without extending end_pos, so
+ // trailing ones never become part of the match
+ TokenType::Punctuation(
+ '-' | '_' | '~' | '!'
| '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' + | ';' | '=' | ':' | '%', + ) => { + continue; + } + TokenType::Punctuation('/') if allow_blank_host => { + // Allow file://../ urls + end_pos = token.from; + restore_pos = self.peek_pos - 1; + break; + } + _ => break, + } + + if start_pos == usize::MAX { + start_pos = token.from; + } + end_pos = token.to; + restore_pos = self.peek_pos; + } + + self.peek_pos = restore_pos; + if end_pos != usize::MAX { + is_valid_host = + (last_label_is_tld && dot_count >= 1 && (text_count + int_count) >= 2) + || (int_count == 4 && dot_count == 3) + || is_ipv6; + (start_pos, end_pos) + } else { + return None; + } + } else { + // Strict hostname parsing + self.try_parse_hostname()? + }; + + // Try parsing port + let start_pos = scheme_token.map(|t| t.from).unwrap_or(host_start_pos); + let mut restore_pos = self.peek_pos; + let mut has_port = false; + let mut last_is_colon = false; + let mut found_query_start = false; + while let Some(token) = self.peek() { + match token.word { + TokenType::Punctuation(':') if !last_is_colon && !has_port => { + last_is_colon = true; + } + TokenType::Integer(_) if last_is_colon => { + has_port = true; + last_is_colon = false; + restore_pos = self.peek_pos; + end_pos = token.to; + } + TokenType::Punctuation('/' | '?') if !last_is_colon => { + found_query_start = true; + end_pos = token.to; + break; + } + _ => { + self.peek_pos = restore_pos; + break; + } + } + } + + // Try parsing query + if found_query_start { + restore_pos = self.peek_pos; + let mut p_count = 0; + let mut b_count = 0; + let mut c_count = 0; + let mut seen_quote = false; + while let Some(token) = self.peek() { + match token.word { + TokenType::Alphabetic(_) + | TokenType::Alphanumeric(_) + | TokenType::Integer(_) + | TokenType::Hexadecimal(_) + | TokenType::Other(_) => {} + TokenType::Punctuation('(') => { + p_count += 1; + continue; + } + TokenType::Punctuation('[') => { + b_count += 1; + continue; + } + TokenType::Punctuation('{') => { + c_count += 1; + continue; + } + TokenType::Punctuation(')') if p_count > 0 => { + p_count -= 1; + } + TokenType::Punctuation(']') if b_count > 0 => { + b_count -= 1; + } + TokenType::Punctuation('}') if c_count > 0 => { + c_count -= 1; + } + TokenType::Punctuation('\'') => { + if !seen_quote { + seen_quote = true; + continue; + } else { + seen_quote = false; + } + } + TokenType::Punctuation('/') => {} + TokenType::Punctuation( + '-' | '_' | '~' | '!' | '$' | '&' | '*' | '+' | ',' | ';' | '=' | ':' | '%' + | '?' | '.' 
| '@',
+ ) => {
+ // accepted via `continue`, which skips the end_pos update,
+ // so a URL never ends on one of these characters
+ continue;
+ }
+ _ => break,
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+ self.peek_pos = restore_pos;
+ }
+
+ Token {
+ word: if has_scheme {
+ if is_valid_host {
+ TokenType::Url(&self.text[start_pos..end_pos])
+ } else {
+ TokenType::UrlNoHost(&self.text[start_pos..end_pos])
+ }
+ } else {
+ TokenType::UrlNoScheme(&self.text[start_pos..end_pos])
+ },
+ from: start_pos,
+ to: end_pos,
+ }
+ .into()
+ }
+
+ fn try_parse_email(&mut self) -> Option<Token<TokenType<'x>>> {
+ // Start token is a valid local part atom
+ let start_token = self.peek()?;
+ let mut last_is_dot = false;
+
+ // Find local part
+ loop {
+ let token = self.peek()?;
+ match token.word {
+ word if word.is_email_atom() => {
+ last_is_dot = false;
+ }
+ TokenType::Punctuation('@') if !last_is_dot => {
+ break;
+ }
+ TokenType::Punctuation('.') if !last_is_dot => {
+ last_is_dot = true;
+ }
+ _ => {
+ return None;
+ }
+ }
+ }
+
+ // Obtain domain part
+ let (_, end_pos) = self.try_parse_hostname()?;
+
+ Token {
+ word: TokenType::Email(&self.text[start_token.from..end_pos]),
+ from: start_token.from,
+ to: end_pos,
+ }
+ .into()
+ }
+
+ fn try_parse_hostname(&mut self) -> Option<(usize, usize)> {
+ let mut last_ch = u8::MAX;
+ let mut has_int = false;
+ let mut has_alpha = false;
+ let mut last_label_is_tld = false;
+
+ let mut dot_count = 0;
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('.') if last_ch == 0 && start_pos != usize::MAX => {
+ last_ch = b'.';
+ dot_count += 1;
+ continue;
+ }
+ // labels may contain '-' but, since this arm also `continue`s
+ // without extending end_pos, a label can never end on one
+ TokenType::Punctuation('-') if last_ch == 0 || last_ch == b'-' => {
+ last_ch = b'-';
+ continue;
+ }
+ TokenType::Punctuation('[') if start_pos == usize::MAX => {
+ return self.try_parse_ipv6(token.from);
+ }
+ TokenType::Alphabetic(text)
+ | TokenType::Alphanumeric(text)
+ | TokenType::Hexadecimal(text)
+ if text.len() <= 63 =>
+ {
+ last_label_is_tld =
+ text.len() >= 2 && self.suffixes.contains(&text.to_ascii_lowercase());
+ has_alpha = true;
+ last_ch = 0;
+ }
+ TokenType::Other(_) => {
+ has_alpha = true;
+ last_label_is_tld = false;
+ last_ch = 0;
+ }
+ TokenType::Integer(text) => {
+ if text.len() <= 3 {
+ has_int = true;
+ }
+ last_label_is_tld = false;
+ last_ch = 0;
+ }
+ _ => {
+ break;
+ }
+ }
+
+ if start_pos == usize::MAX {
+ start_pos = token.from;
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+ self.peek_pos = restore_pos;
+
+ // a trailing dot is not counted
+ if last_ch == b'.' {
+ dot_count -= 1;
+ }
+
+ if end_pos != usize::MAX
+ && dot_count >= 1
+ && (last_label_is_tld || (has_int && !has_alpha && dot_count == 3))
+ {
+ (start_pos, end_pos).into()
+ } else {
+ None
+ }
+ }
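+
+ // a bracketed IPv6 literal: hex/integer groups separated by ':', with an
+ // optional dotted (IPv4-mapped) tail that may only appear at the end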
+ fn try_parse_ipv6(&mut self, start_pos: usize) -> Option<(usize, usize)> {
+ let mut found_colon = false;
+ let mut last_ch = u8::MAX;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Integer(_) | TokenType::Hexadecimal(_) => {
+ last_ch = 0;
+ }
+ TokenType::Punctuation(':') if last_ch != b'.' => {
+ found_colon = true;
+ last_ch = b':';
+ }
+ TokenType::Punctuation('.') if last_ch == 0 => {
+ last_ch = b'.';
+ }
+ TokenType::Punctuation(']') if found_colon && last_ch == 0 => {
+ return (start_pos, token.to).into();
+ }
+ _ => return None,
+ }
+ }
+
+ None
+ }
+
+ fn try_parse_number(&mut self) -> Option<Token<TokenType<'x>>> {
+ self.peek_rewind();
+ let mut start_pos = usize::MAX;
+ let mut end_pos = usize::MAX;
+ let mut restore_pos = self.peek_pos;
+
+ let mut seen_integer = 0;
+ let mut seen_dot = false;
+
+ while let Some(token) = self.peek() {
+ match token.word {
+ TokenType::Punctuation('-') if start_pos == usize::MAX => {}
+ TokenType::Integer(_) if seen_integer == 0 || seen_dot => {
+ seen_integer += 1;
+ }
+ TokenType::Punctuation('.') if seen_integer != 0 => {
+ if !seen_dot {
+ seen_dot = true;
+ continue;
+ } else {
+ // Avoid parsing num.num.num as floats
+ return None;
+ }
+ }
+ _ => break,
+ }
+
+ if start_pos == usize::MAX {
+ start_pos = token.from;
+ }
+ end_pos = token.to;
+ restore_pos = self.peek_pos;
+ }
+
+ self.peek_pos = restore_pos;
+
+ if seen_integer > 0 {
+ let text = &self.text[start_pos..end_pos];
+
+ Token {
+ // two integer parts seen, one on each side of the dot
+ word: if seen_integer == 2 {
+ TokenType::Float(text)
+ } else {
+ TokenType::Integer(text)
+ },
+ from: start_pos,
+ to: end_pos,
+ }
+ .into()
+ } else {
+ None
+ }
+ }
+
+ // consumes "://" after the scheme word already peeked by the caller,
+ // optionally allowing a '+tag' segment as in "svn+ssh://"
+ fn try_skip_url_scheme(&mut self) -> bool {
+ enum State {
+ None,
+ PlusAlpha,
+ Colon,
+ Slash1,
+ Slash2,
+ }
+ let mut state = State::None;
+
+ while let Some(token) = self.peek() {
+ state = match (token.word, state) {
+ (TokenType::Punctuation(':'), State::None | State::Colon) => State::Slash1,
+ (TokenType::Punctuation('/'), State::Slash1) => State::Slash2,
+ (TokenType::Punctuation('/'), State::Slash2) => return true,
+ (TokenType::Punctuation('+'), State::None) => State::PlusAlpha,
+ (TokenType::Alphabetic(t) | TokenType::Hexadecimal(t), State::PlusAlpha)
+ if t.chars().all(|c| c.is_ascii()) =>
+ {
+ State::Colon
+ }
+ _ => break,
+ };
+ }
+ self.peek_rewind();
+ false
+ }
+}
+
+impl<'x> TokenType<'x> {
+ fn is_email_atom(&self) -> bool {
+ // (roughly the RFC 5321 atext set)
+ matches!(
+ self,
+ TokenType::Alphabetic(_)
+ | TokenType::Integer(_)
+ | TokenType::Alphanumeric(_)
+ | TokenType::Hexadecimal(_)
+ | TokenType::Other(_)
+ | TokenType::Punctuation(
+ '!' | '#'
+ | '$'
+ | '%'
+ | '&'
+ | '\''
+ | '*'
+ | '+'
+ | '-'
+ | '/'
+ | '='
+ | '?'
+ | '^' + | '_' + | '`' + | '{' + | '|' + | '}' + | '~', + ) + ) + } + + fn is_domain_atom(&self, is_start: bool) -> bool { + matches!( + self, + TokenType::Alphabetic(_) + | TokenType::Integer(_) + | TokenType::Alphanumeric(_) + | TokenType::Hexadecimal(_) + | TokenType::Other(_) + ) || (!is_start && matches!(self, TokenType::Punctuation('-'))) + } +} + +#[cfg(test)] +mod test { + use crate::PublicSuffix; + + use super::{TokenType, TypesTokenizer}; + + #[test] + fn type_tokenizer() { + let mut suffixes = PublicSuffix::default(); + suffixes.suffixes.insert("com".to_string()); + suffixes.suffixes.insert("co".to_string()); + suffixes.suffixes.insert("org".to_string()); + + // Credits: test suite from linkify crate + for (text, expected) in [ + ("", vec![]), + ("foo", vec![TokenType::Alphabetic("foo")]), + (":", vec![TokenType::Punctuation(':')]), + ( + "://", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + ":::", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation(':'), + TokenType::Punctuation(':'), + ], + ), + ( + "://foo", + vec![ + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "1://foo", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "123://foo", + vec![ + TokenType::Integer("123"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "+://foo", + vec![ + TokenType::Punctuation('+'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + "-://foo", + vec![ + TokenType::Punctuation('-'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ( + ".://foo", + vec![ + TokenType::Punctuation('.'), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("foo"), + ], + ), + ("1abc://foo", vec![TokenType::UrlNoHost("1abc://foo")]), + ("a://foo", vec![TokenType::UrlNoHost("a://foo")]), + ("a123://foo", vec![TokenType::UrlNoHost("a123://foo")]), + ("a123b://foo", vec![TokenType::UrlNoHost("a123b://foo")]), + ("a+b://foo", vec![TokenType::UrlNoHost("a+b://foo")]), + ( + "a-b://foo", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('-'), + TokenType::UrlNoHost("b://foo"), + ], + ), + ( + "a.b://foo", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::UrlNoHost("b://foo"), + ], + ), + ("ABC://foo", vec![TokenType::UrlNoHost("ABC://foo")]), + ( + ".http://example.org/", + vec![ + TokenType::Punctuation('.'), + TokenType::Url("http://example.org/"), + ], + ), + ( + "1.http://example.org/", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Url("http://example.org/"), + ], + ), + ( + "ab://", + vec![ + TokenType::Hexadecimal("ab"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + "file://", + vec![ + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + ], + ), + ( + "file:// ", + vec![ + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + 
TokenType::Punctuation('/'), + TokenType::Space, + ], + ), + ( + "\"file://\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Punctuation('"'), + ], + ), + ( + "\"file://...\", ", + vec![ + TokenType::Punctuation('"'), + TokenType::Alphabetic("file"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('"'), + TokenType::Punctuation(','), + TokenType::Space, + ], + ), + ( + "file://somefile", + vec![TokenType::UrlNoHost("file://somefile")], + ), + ( + "file://../relative", + vec![TokenType::UrlNoHost("file://../relative")], + ), + ( + "http://a.", + vec![ + TokenType::UrlNoHost("http://a"), + TokenType::Punctuation('.'), + ], + ), + ("http://127.0.0.1", vec![TokenType::Url("http://127.0.0.1")]), + ( + "http://127.0.0.1/", + vec![TokenType::Url("http://127.0.0.1/")], + ), + ("ab://c", vec![TokenType::UrlNoHost("ab://c")]), + ( + "http://example.org/", + vec![TokenType::Url("http://example.org/")], + ), + ( + "http://example.org/123", + vec![TokenType::Url("http://example.org/123")], + ), + ( + "http://example.org/?foo=test&bar=123", + vec![TokenType::Url("http://example.org/?foo=test&bar=123")], + ), + ( + "http://example.org/?foo=%20", + vec![TokenType::Url("http://example.org/?foo=%20")], + ), + ( + "http://example.org/%3C", + vec![TokenType::Url("http://example.org/%3C")], + ), + ("example.org/", vec![TokenType::UrlNoScheme("example.org/")]), + ( + "example.org/123", + vec![TokenType::UrlNoScheme("example.org/123")], + ), + ( + "example.org/?foo=test&bar=123", + vec![TokenType::UrlNoScheme("example.org/?foo=test&bar=123")], + ), + ( + "example.org/?foo=%20", + vec![TokenType::UrlNoScheme("example.org/?foo=%20")], + ), + ( + "example.org/%3C", + vec![TokenType::UrlNoScheme("example.org/%3C")], + ), + ( + "foo http://example.org/", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::Url("http://example.org/"), + ], + ), + ( + "http://example.org/ bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\tbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\nbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\u{b}bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{b}'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\u{c}bar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/\rbar", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "foo example.org/", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::UrlNoScheme("example.org/"), + ], + ), + ( + "example.org/ bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\tbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\nbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + 
TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\u{b}bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{b}'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\u{c}bar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/\rbar", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Space, + TokenType::Alphabetic("bar"), + ], + ), + ( + "http://example.org/<", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('<'), + ], + ), + ( + "http://example.org/>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/<>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('<'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/\0", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\0'), + ], + ), + ( + "http://example.org/\u{e}", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{e}'), + ], + ), + ( + "http://example.org/\u{7f}", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\u{7f}'), + ], + ), + ( + "http://example.org/\u{9f}", + vec![TokenType::Url("http://example.org/\u{9f}")], + ), + ( + "http://example.org/foo|bar", + vec![ + TokenType::Url("http://example.org/foo"), + TokenType::Punctuation('|'), + TokenType::Alphabetic("bar"), + ], + ), + ( + "example.org/<", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('<'), + ], + ), + ( + "example.org/>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/<>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('<'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/\0", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\0'), + ], + ), + ( + "example.org/\u{e}", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{e}'), + ], + ), + ( + "example.org/\u{7f}", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\u{7f}'), + ], + ), + ( + "example.org/\u{9f}", + vec![TokenType::UrlNoScheme("example.org/\u{9f}")], + ), + ( + "http://example.org/.", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/..", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/,", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(','), + ], + ), + ( + "http://example.org/:", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(':'), + ], + ), + ( + "http://example.org/?", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('?'), + ], + ), + ( + "http://example.org/!", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('!'), + ], + ), + ( + "http://example.org/;", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(';'), + ], + ), + ( + "example.org/.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/..", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/,", + vec![ + TokenType::UrlNoScheme("example.org/"), + 
TokenType::Punctuation(','), + ], + ), + ( + "example.org/:", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(':'), + ], + ), + ( + "example.org/?", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('?'), + ], + ), + ( + "example.org/!", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('!'), + ], + ), + ( + "example.org/;", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org/a(b)", + vec![TokenType::Url("http://example.org/a(b)")], + ), + ( + "http://example.org/a[b]", + vec![TokenType::Url("http://example.org/a[b]")], + ), + ( + "http://example.org/a{b}", + vec![TokenType::Url("http://example.org/a{b}")], + ), + ( + "http://example.org/a'b'", + vec![TokenType::Url("http://example.org/a'b'")], + ), + ( + "(http://example.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "[http://example.org/]", + vec![ + TokenType::Punctuation('['), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(']'), + ], + ), + ( + "{http://example.org/}", + vec![ + TokenType::Punctuation('{'), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('}'), + ], + ), + ( + "\"http://example.org/\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + ], + ), + ( + "'http://example.org/'", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('\''), + ], + ), + ( + "example.org/a(b)", + vec![TokenType::UrlNoScheme("example.org/a(b)")], + ), + ( + "example.org/a[b]", + vec![TokenType::UrlNoScheme("example.org/a[b]")], + ), + ( + "example.org/a{b}", + vec![TokenType::UrlNoScheme("example.org/a{b}")], + ), + ( + "example.org/a'b'", + vec![TokenType::UrlNoScheme("example.org/a'b'")], + ), + ( + "(example.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "[example.org/]", + vec![ + TokenType::Punctuation('['), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(']'), + ], + ), + ( + "{example.org/}", + vec![ + TokenType::Punctuation('{'), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('}'), + ], + ), + ( + "\"example.org/\"", + vec![ + TokenType::Punctuation('"'), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('"'), + ], + ), + ( + "'example.org/'", + vec![ + TokenType::Punctuation('\''), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\''), + ], + ), + ( + "((http://example.org/))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "((http://example.org/a(b)))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/a(b)"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "[(http://example.org/)]", + vec![ + TokenType::Punctuation('['), + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(']'), + ], + ), + ( + "(http://example.org/).", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('.'), + ], + ), + ( + "(http://example.org/.)", + 
vec![ + TokenType::Punctuation('('), + TokenType::Url("http://example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation(')'), + ], + ), + ( + "http://example.org/>", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org/(", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('('), + ], + ), + ( + "http://example.org/(.", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('('), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/]()", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation(']'), + TokenType::Punctuation('('), + TokenType::Punctuation(')'), + ], + ), + ( + "((example.org/))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "((example.org/a(b)))", + vec![ + TokenType::Punctuation('('), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/a(b)"), + TokenType::Punctuation(')'), + TokenType::Punctuation(')'), + ], + ), + ( + "[(example.org/)]", + vec![ + TokenType::Punctuation('['), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation(']'), + ], + ), + ( + "(example.org/).", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('.'), + ], + ), + ( + "(example.org/.)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('.'), + TokenType::Punctuation(')'), + ], + ), + ( + "example.org/>", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org/(", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('('), + ], + ), + ( + "example.org/(.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('('), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/]()", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation(']'), + TokenType::Punctuation('('), + TokenType::Punctuation(')'), + ], + ), + ( + "'https://example.org'", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + ], + ), + ( + "\"https://example.org\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Url("https://example.org"), + TokenType::Punctuation('"'), + ], + ), + ( + "''https://example.org''", + vec![ + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + ], + ), + ( + "'https://example.org''", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('\''), + ], + ), + ( + "'https://example.org", + vec![ + TokenType::Punctuation('\''), + TokenType::Url("https://example.org"), + ], + ), + ( + "http://example.org/'_(foo)", + vec![TokenType::Url("http://example.org/'_(foo)")], + ), + ( + "http://example.org/'_(foo)'", + vec![TokenType::Url("http://example.org/'_(foo)'")], + ), + ( + "http://example.org/''", + vec![TokenType::Url("http://example.org/''")], + ), + ( + "http://example.org/'''", + vec![ + TokenType::Url("http://example.org/''"), + TokenType::Punctuation('\''), + ], + ), + ( + "http://example.org/'.", + vec![ + 
TokenType::Url("http://example.org/"), + TokenType::Punctuation('\''), + TokenType::Punctuation('.'), + ], + ), + ( + "http://example.org/'a", + vec![TokenType::Url("http://example.org/'a")], + ), + ( + "http://example.org/it's", + vec![TokenType::Url("http://example.org/it's")], + ), + ( + "example.org/'_(foo)", + vec![TokenType::UrlNoScheme("example.org/'_(foo)")], + ), + ( + "example.org/'_(foo)'", + vec![TokenType::UrlNoScheme("example.org/'_(foo)'")], + ), + ( + "example.org/''", + vec![TokenType::UrlNoScheme("example.org/''")], + ), + ( + "example.org/'''", + vec![ + TokenType::UrlNoScheme("example.org/''"), + TokenType::Punctuation('\''), + ], + ), + ( + "example.org/'.", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('\''), + TokenType::Punctuation('.'), + ], + ), + ( + "example.org/'a", + vec![TokenType::UrlNoScheme("example.org/'a")], + ), + ( + "example.org/it's", + vec![TokenType::UrlNoScheme("example.org/it's")], + ), + ( + "http://example.org/\"a", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "http://example.org/\"a\"", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('"'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('"'), + ], + ), + ( + "http://example.org/`a", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "http://example.org/`a`", + vec![ + TokenType::Url("http://example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('`'), + ], + ), + ( + "https://example.org*", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/*", + vec![ + TokenType::Url("https://example.org/"), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/**", + vec![ + TokenType::Url("https://example.org/"), + TokenType::Punctuation('*'), + TokenType::Punctuation('*'), + ], + ), + ( + "https://example.org/*/a", + vec![TokenType::Url("https://example.org/*/a")], + ), + ( + "example.org/`a", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + ], + ), + ( + "example.org/`a`", + vec![ + TokenType::UrlNoScheme("example.org/"), + TokenType::Punctuation('`'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('`'), + ], + ), + ( + "http://example.org\">", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org'>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org\"/>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org'/>", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org
<p>
", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('<'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org
</p>
", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\">", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org'>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\"/>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org'/>", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org
<p>
", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('<'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org
</p>
", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("p"), + TokenType::Punctuation('>'), + ], + ), + ( + "http://example.org\");", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org');", + vec![ + TokenType::Url("http://example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("img"), + TokenType::Space, + TokenType::Alphabetic("src"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/test.svg"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + ], + ), + ( + "
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Hexadecimal("a"), + TokenType::Space, + TokenType::Alphabetic("href"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Hexadecimal("a"), + TokenType::Space, + TokenType::Alphabetic("href"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org"), + TokenType::Punctuation('"'), + TokenType::Space, + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Hexadecimal("a"), + TokenType::Punctuation('>'), + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "
<div> \n <img src=\"http://example.org/test3.jpg\" /> \n </div>
", + vec![ + TokenType::Punctuation('<'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + TokenType::Space, + TokenType::Punctuation('<'), + TokenType::Alphabetic("img"), + TokenType::Space, + TokenType::Alphabetic("src"), + TokenType::Punctuation('='), + TokenType::Punctuation('"'), + TokenType::Url("http://example.org/test3.jpg"), + TokenType::Punctuation('"'), + TokenType::Space, + TokenType::Punctuation('/'), + TokenType::Punctuation('>'), + TokenType::Space, + TokenType::Punctuation('<'), + TokenType::Punctuation('/'), + TokenType::Alphabetic("div"), + TokenType::Punctuation('>'), + ], + ), + ( + "example.org\");", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('"'), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "example.org');", + vec![ + TokenType::UrlNoScheme("example.org"), + TokenType::Punctuation('\''), + TokenType::Punctuation(')'), + TokenType::Punctuation(';'), + ], + ), + ( + "http://example.org/", + vec![TokenType::Url("http://example.org/")], + ), + ( + "http://example.org/a/", + vec![TokenType::Url("http://example.org/a/")], + ), + ( + "http://example.org//", + vec![TokenType::Url("http://example.org//")], + ), + ("example.org/", vec![TokenType::UrlNoScheme("example.org/")]), + ( + "example.org/a/", + vec![TokenType::UrlNoScheme("example.org/a/")], + ), + ( + "example.org//", + vec![TokenType::UrlNoScheme("example.org//")], + ), + ( + "http://one.org/ http://two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "http://one.org/ : http://two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "(http://one.org/)(http://two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::Url("http://two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "one.org/ two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "one.org/ : two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "(one.org/)(two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::UrlNoScheme("one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "http://one.org/ two.org/", + vec![ + TokenType::Url("http://one.org/"), + TokenType::Space, + TokenType::UrlNoScheme("two.org/"), + ], + ), + ( + "one.org/ : http://two.org/", + vec![ + TokenType::UrlNoScheme("one.org/"), + TokenType::Space, + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Url("http://two.org/"), + ], + ), + ( + "(http://one.org/)(two.org/)", + vec![ + TokenType::Punctuation('('), + TokenType::Url("http://one.org/"), + TokenType::Punctuation(')'), + TokenType::Punctuation('('), + TokenType::UrlNoScheme("two.org/"), + TokenType::Punctuation(')'), + ], + ), + ( + "http://üñîçøðé.com", + vec![TokenType::Url("http://üñîçøðé.com")], + ), + ( + "http://üñîçøðé.com/ä", + vec![TokenType::Url("http://üñîçøðé.com/ä")], + ), + ( + "http://example.org/¡", + vec![TokenType::Url("http://example.org/¡")], + ), + ( + "http://example.org/¢", + 
vec![TokenType::Url("http://example.org/¢")], + ), + ( + "http://example.org/😀", + vec![TokenType::Url("http://example.org/😀")], + ), + ( + "http://example.org/¢/", + vec![TokenType::Url("http://example.org/¢/")], + ), + ( + "http://xn--c1h.example.com/", + vec![TokenType::Url("http://xn--c1h.example.com/")], + ), + ("üñîçøðé.com", vec![TokenType::UrlNoScheme("üñîçøðé.com")]), + ( + "üñîçøðé.com/ä", + vec![TokenType::UrlNoScheme("üñîçøðé.com/ä")], + ), + ( + "example.org/¡", + vec![TokenType::UrlNoScheme("example.org/¡")], + ), + ( + "example.org/¢", + vec![TokenType::UrlNoScheme("example.org/¢")], + ), + ( + "example.org/😀", + vec![TokenType::UrlNoScheme("example.org/😀")], + ), + ( + "example.org/¢/", + vec![TokenType::UrlNoScheme("example.org/¢/")], + ), + ( + "xn--c1h.example.com/", + vec![TokenType::UrlNoScheme("xn--c1h.example.com/")], + ), + ( + "example.", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + ], + ), + ( + "example./", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Punctuation('/'), + ], + ), + ( + "foo.com.", + vec![ + TokenType::UrlNoScheme("foo.com"), + TokenType::Punctuation('.'), + ], + ), + ( + "example.c", + vec![ + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("example.co", vec![TokenType::UrlNoScheme("example.co")]), + ("example.com", vec![TokenType::UrlNoScheme("example.com")]), + ("e.com", vec![TokenType::UrlNoScheme("e.com")]), + ( + "exampl.e.c", + vec![ + TokenType::Alphabetic("exampl"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("e"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("exampl.e.co", vec![TokenType::UrlNoScheme("exampl.e.co")]), + ( + "e.xample.c", + vec![ + TokenType::Hexadecimal("e"), + TokenType::Punctuation('.'), + TokenType::Alphabetic("xample"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("c"), + ], + ), + ("e.xample.co", vec![TokenType::UrlNoScheme("e.xample.co")]), + ( + "v1.1.1", + vec![ + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + ], + ), + ( + "foo.bar@example.org", + vec![TokenType::Email("foo.bar@example.org")], + ), + ( + "example.com@example.com", + vec![TokenType::Email("example.com@example.com")], + ), + ( + "Look, no scheme: example.org/foo email@foo.com", + vec![ + TokenType::Alphabetic("Look"), + TokenType::Punctuation(','), + TokenType::Space, + TokenType::Alphabetic("no"), + TokenType::Space, + TokenType::Alphabetic("scheme"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("example.org/foo"), + TokenType::Space, + TokenType::Email("email@foo.com"), + ], + ), + ( + "Web:\nwww.foobar.co\nE-Mail:\n bar@foobar.co (bla bla bla)", + vec![ + TokenType::Alphabetic("Web"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::UrlNoScheme("www.foobar.co"), + TokenType::Space, + TokenType::Hexadecimal("E"), + TokenType::Punctuation('-'), + TokenType::Alphabetic("Mail"), + TokenType::Punctuation(':'), + TokenType::Space, + TokenType::Email("bar@foobar.co"), + TokenType::Space, + TokenType::Punctuation('('), + TokenType::Alphabetic("bla"), + TokenType::Space, + TokenType::Alphabetic("bla"), + TokenType::Space, + TokenType::Alphabetic("bla"), + TokenType::Punctuation(')'), + ], + ), + ( + "upi://pay?pa=XXXXXXX&pn=XXXXX", + vec![TokenType::UrlNoHost("upi://pay?pa=XXXXXXX&pn=XXXXX")], + ), + ( + 
"https://example.org?pa=XXXXXXX&pn=XXXXX", + vec![TokenType::Url("https://example.org?pa=XXXXXXX&pn=XXXXX")], + ), + ( + "website https://domain.com", + vec![ + TokenType::Alphabetic("website"), + TokenType::Space, + TokenType::Url("https://domain.com"), + ], + ), + ("a12.b-c.com", vec![TokenType::UrlNoScheme("a12.b-c.com")]), + ( + "v1.2.3", + vec![ + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("2"), + TokenType::Punctuation('.'), + TokenType::Integer("3"), + ], + ), + ( + "https://12-7.0.0.1/", + vec![TokenType::UrlNoHost("https://12-7.0.0.1/")], + ), + ( + "https://user:pass@example.com/", + vec![TokenType::Url("https://user:pass@example.com/")], + ), + ( + "https://user:-.!$@example.com/", + vec![TokenType::Url("https://user:-.!$@example.com/")], + ), + ( + "https://user:!$&'()*+,;=@example.com/", + vec![TokenType::Url("https://user:!$&'()*+,;=@example.com/")], + ), + ( + "https://user:pass@ex@mple.com/", + vec![ + TokenType::UrlNoHost("https://user:pass@ex"), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("mple.com/"), + ], + ), + ( + "https://localhost:8080!", + vec![ + TokenType::UrlNoHost("https://localhost:8080"), + TokenType::Punctuation('!'), + ], + ), + ( + "https://localhost:8080/", + vec![TokenType::UrlNoHost("https://localhost:8080/")], + ), + ( + "https://user:pass@example.com:8080/hi", + vec![TokenType::Url("https://user:pass@example.com:8080/hi")], + ), + ( + "https://127.0.0.1/", + vec![TokenType::Url("https://127.0.0.1/")], + ), + ("1.0.0.0", vec![TokenType::UrlNoScheme("1.0.0.0")]), + ( + "1.0.0.0/foo/bar", + vec![TokenType::UrlNoScheme("1.0.0.0/foo/bar")], + ), + ("1.0 ", vec![TokenType::Float("1.0"), TokenType::Space]), + ( + "1.0.0", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + ], + ), + ( + "1.0.0.0.0", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::UrlNoScheme("0.0.0.0"), + ], + ), + ( + "1.0.0.", + vec![ + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + TokenType::Integer("0"), + TokenType::Punctuation('.'), + ], + ), + ( + "https://example.com.:8080/test", + vec![TokenType::Url("https://example.com.:8080/test")], + ), + ( + "https://example.org'", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('\''), + ], + ), + ( + "https://example.org'a@example.com", + vec![TokenType::Url("https://example.org'a@example.com")], + ), + ( + "https://a.com'https://b.com", + vec![ + TokenType::UrlNoHost("https://a.com'https"), + TokenType::Punctuation(':'), + TokenType::Punctuation('/'), + TokenType::Punctuation('/'), + TokenType::UrlNoScheme("b.com"), + ], + ), + ( + "https://example.com...", + vec![ + TokenType::Url("https://example.com"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "www.example..com", + vec![ + TokenType::Alphabetic("www"), + TokenType::Punctuation('.'), + TokenType::Alphabetic("example"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ( + "https://.www.example.com", + vec![TokenType::Url("https://.www.example.com")], + ), + ( + "-a.com", + vec![TokenType::Punctuation('-'), TokenType::UrlNoScheme("a.com")], + ), + ("https://a.-b.com", vec![TokenType::Url("https://a.-b.com")]), + ( + "a-.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('-'), + 
TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ( + "a.b-.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('-'), + TokenType::Punctuation('.'), + TokenType::Alphabetic("com"), + ], + ), + ("https://a.b-.com", vec![TokenType::Url("https://a.b-.com")]), + ( + "https://example.com-/", + vec![ + TokenType::Url("https://example.com"), + TokenType::Punctuation('-'), + TokenType::Punctuation('/'), + ], + ), + ( + "https://example.org-", + vec![ + TokenType::Url("https://example.org"), + TokenType::Punctuation('-'), + ], + ), + ( + "example.com@about", + vec![ + TokenType::UrlNoScheme("example.com"), + TokenType::Punctuation('@'), + TokenType::Alphabetic("about"), + ], + ), + ( + "example.com/@about", + vec![TokenType::UrlNoScheme("example.com/@about")], + ), + ( + "https://example.com/@about", + vec![TokenType::Url("https://example.com/@about")], + ), + ( + "info@v1.1.1", + vec![ + TokenType::Alphabetic("info"), + TokenType::Punctuation('@'), + TokenType::Alphanumeric("v1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + TokenType::Punctuation('.'), + TokenType::Integer("1"), + ], + ), + ("file:///", vec![TokenType::UrlNoHost("file:///")]), + ( + "file:///home/foo", + vec![TokenType::UrlNoHost("file:///home/foo")], + ), + ( + "file://localhost/home/foo", + vec![TokenType::UrlNoHost("file://localhost/home/foo")], + ), + ( + "facetime://+19995551234", + vec![TokenType::UrlNoHost("facetime://+19995551234")], + ), + ( + "test://123'456!!!", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + ], + ), + ( + "test://123'456...", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + ], + ), + ( + "test://123'456!!!/", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('!'), + TokenType::Punctuation('/'), + ], + ), + ( + "test://123'456.../", + vec![ + TokenType::UrlNoHost("test://123'456"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Punctuation('/'), + ], + ), + ( + "1abc://example.com", + vec![TokenType::Url("1abc://example.com")], + ), + ( + "¡¢example.com", + vec![TokenType::UrlNoScheme("¡¢example.com")], + ), + ("foo", vec![TokenType::Alphabetic("foo")]), + ("@", vec![TokenType::Punctuation('@')]), + ( + "a@", + vec![TokenType::Hexadecimal("a"), TokenType::Punctuation('@')], + ), + ( + "@a", + vec![TokenType::Punctuation('@'), TokenType::Hexadecimal("a")], + ), + ( + "@@@", + vec![ + TokenType::Punctuation('@'), + TokenType::Punctuation('@'), + TokenType::Punctuation('@'), + ], + ), + ("foo@example.com", vec![TokenType::Email("foo@example.com")]), + ( + "foo.bar@example.com", + vec![TokenType::Email("foo.bar@example.com")], + ), + ( + "#!$%&'*+-/=?^_`{}|~@example.org", + vec![TokenType::Email("#!$%&'*+-/=?^_`{}|~@example.org")], + ), + ( + "foo a@b.com", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Space, + TokenType::Email("a@b.com"), + ], + ), + ( + "a@b.com foo", + vec![ + TokenType::Email("a@b.com"), + TokenType::Space, + TokenType::Alphabetic("foo"), + ], + ), + ( + "\na@b.com", + vec![TokenType::Space, TokenType::Email("a@b.com")], + ), + ( + "a@b.com\n", + vec![TokenType::Email("a@b.com"), TokenType::Space], + ), + ( + "(a@example.com)", + 
vec![ + TokenType::Punctuation('('), + TokenType::Email("a@example.com"), + TokenType::Punctuation(')'), + ], + ), + ( + "\"a@example.com\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Email("a@example.com"), + TokenType::Punctuation('"'), + ], + ), + ( + "\"a@example.com\"", + vec![ + TokenType::Punctuation('"'), + TokenType::Email("a@example.com"), + TokenType::Punctuation('"'), + ], + ), + ( + ",a@example.com,", + vec![ + TokenType::Punctuation(','), + TokenType::Email("a@example.com"), + TokenType::Punctuation(','), + ], + ), + ( + ":a@example.com:", + vec![ + TokenType::Punctuation(':'), + TokenType::Email("a@example.com"), + TokenType::Punctuation(':'), + ], + ), + ( + ";a@example.com;", + vec![ + TokenType::Punctuation(';'), + TokenType::Email("a@example.com"), + TokenType::Punctuation(';'), + ], + ), + ( + ".@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("example.com"), + ], + ), + ( + "foo.@example.com", + vec![ + TokenType::Alphabetic("foo"), + TokenType::Punctuation('.'), + TokenType::Punctuation('@'), + TokenType::UrlNoScheme("example.com"), + ], + ), + ( + ".foo@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Email("foo@example.com"), + ], + ), + ( + ".foo@example.com", + vec![ + TokenType::Punctuation('.'), + TokenType::Email("foo@example.com"), + ], + ), + ( + "a..b@example.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('.'), + TokenType::Punctuation('.'), + TokenType::Email("b@example.com"), + ], + ), + ( + "a@example.com.", + vec![ + TokenType::Email("a@example.com"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + ], + ), + ( + "a@b.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b.com.", + vec![TokenType::Email("a@b.com"), TokenType::Punctuation('.')], + ), + ( + "a@example.com-", + vec![ + TokenType::Email("a@example.com"), + TokenType::Punctuation('-'), + ], + ), + ("a@foo-bar.com", vec![TokenType::Email("a@foo-bar.com")]), + ( + "a@-foo.com", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Punctuation('-'), + TokenType::UrlNoScheme("foo.com"), + ], + ), + ( + "a@b-.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('-'), + TokenType::Punctuation('.'), + ], + ), + ( + "a@b", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + ], + ), + ( + "a@b.", + vec![ + TokenType::Hexadecimal("a"), + TokenType::Punctuation('@'), + TokenType::Hexadecimal("b"), + TokenType::Punctuation('.'), + ], + ), + ( + "a@example.com b@example.com", + vec![ + TokenType::Email("a@example.com"), + TokenType::Space, + TokenType::Email("b@example.com"), + ], + ), + ( + "a@example.com @ b@example.com", + vec![ + TokenType::Email("a@example.com"), + TokenType::Space, + TokenType::Punctuation('@'), + TokenType::Space, + TokenType::Email("b@example.com"), + ], + ), + ( + "a@xy.com;b@xy.com,c@xy.com", + vec![ + TokenType::Email("a@xy.com"), + TokenType::Punctuation(';'), + TokenType::Email("b@xy.com"), + TokenType::Punctuation(','), + TokenType::Email("c@xy.com"), + ], + ), + ( + "üñîçøðé@example.com", + vec![TokenType::Email("üñîçøðé@example.com")], + ), + ( + "üñîçøðé@üñîçøðé.com", + 
vec![TokenType::Email("üñîçøðé@üñîçøðé.com")],
+ ),
+ ("www@example.com", vec![TokenType::Email("www@example.com")]),
+ (
+ "a@a.xyϸ",
+ vec![
+ TokenType::Hexadecimal("a"),
+ TokenType::Punctuation('@'),
+ TokenType::Hexadecimal("a"),
+ TokenType::Punctuation('.'),
+ TokenType::Alphabetic("xyϸ"),
+ ],
+ ),
+ (
+ "100 -100 100.00 -100.00 $100 $100.00",
+ vec![
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Integer("-100"),
+ TokenType::Space,
+ TokenType::Float("100.00"),
+ TokenType::Space,
+ TokenType::Float("-100.00"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Float("100.00"),
+ ],
+ ),
+ (
+ " - 100 100 . 00",
+ vec![
+ TokenType::Space,
+ TokenType::Punctuation('-'),
+ TokenType::Space,
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Integer("100"),
+ TokenType::Space,
+ TokenType::Punctuation('.'),
+ TokenType::Space,
+ TokenType::Integer("00"),
+ ],
+ ),
+ (
+ "send $100.00 to user@domain.com or visit domain.com/pay-me!",
+ vec![
+ TokenType::Alphabetic("send"),
+ TokenType::Space,
+ TokenType::Punctuation('$'),
+ TokenType::Float("100.00"),
+ TokenType::Space,
+ TokenType::Alphabetic("to"),
+ TokenType::Space,
+ TokenType::Email("user@domain.com"),
+ TokenType::Space,
+ TokenType::Alphabetic("or"),
+ TokenType::Space,
+ TokenType::Alphabetic("visit"),
+ TokenType::Space,
+ TokenType::UrlNoScheme("domain.com/pay-me"),
+ TokenType::Punctuation('!'),
+ ],
+ ),
+ ] {
+ let result = TypesTokenizer::new(text, &suffixes)
+ .map(|t| t.word)
+ .collect::<Vec<_>>();
+
+ assert_eq!(result, expected);
+
+ /*print!("({text:?}, ");
+ print!("vec![");
+ for (pos, item) in result.into_iter().enumerate() {
+ if pos > 0 {
+ print!(", ");
+ }
+ print!("TokenType::{:?}", item);
+ }
+ println!("]),");*/
+ }
+ }
+}
diff --git a/crates/store/src/fts/tokenizers/indo_european.rs b/crates/nlp/src/tokenizers/word.rs
similarity index 94%
rename from crates/store/src/fts/tokenizers/indo_european.rs
rename to crates/nlp/src/tokenizers/word.rs
index e1f34ce6..26854fbf 100644
--- a/crates/store/src/fts/tokenizers/indo_european.rs
+++ b/crates/nlp/src/tokenizers/word.rs
@@ -21,19 +21,19 @@
  * for more details.
  */
 
-use std::str::CharIndices;
+use std::{borrow::Cow, str::CharIndices};
 
 use super::Token;
 
-pub struct IndoEuropeanTokenizer<'x> {
+pub struct WordTokenizer<'x> {
 max_token_length: usize,
 text: &'x str,
 iterator: CharIndices<'x>,
 }
 
-impl<'x> IndoEuropeanTokenizer<'x> {
- pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
- IndoEuropeanTokenizer {
+impl<'x> WordTokenizer<'x> {
+ pub fn new(text: &str, max_token_length: usize) -> WordTokenizer {
+ WordTokenizer {
 max_token_length,
 text,
 iterator: text.char_indices(),
@@ -42,8 +42,8 @@ impl<'x> WordTokenizer<'x> {
 }
 }
 
 /// Parses indo-european text into lowercase tokens.
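+// (with the rename, the iterator below also moves from Token<'x> to
+// Token<Cow<'x, str>>, so word tokens may be borrowed or owned)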
-impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
- type Item = Token<'x>;
+impl<'x> Iterator for WordTokenizer<'x> {
+ type Item = Token<Cow<'x, str>>;
 
 fn next(&mut self) -> Option<Self::Item> {
 while let Some((token_start, ch)) = self.iterator.next() {
@@ -159,7 +159,7 @@ mod tests {
 ];
 
 for (input, tokens) in inputs.iter() {
- for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
+ for (pos, token) in WordTokenizer::new(input, 40).enumerate() {
 assert_eq!(token, tokens[pos]);
 }
 }
diff --git a/crates/store/src/fts/ngram.rs b/crates/nlp/src/transformers/mod.rs
similarity index 53%
rename from crates/store/src/fts/ngram.rs
rename to crates/nlp/src/transformers/mod.rs
index 2ca2c781..1d2d365b 100644
--- a/crates/store/src/fts/ngram.rs
+++ b/crates/nlp/src/transformers/mod.rs
@@ -21,41 +21,4 @@
  * for more details.
  */
 
-use std::borrow::Cow;
-
-use super::bloom::{BloomFilter, BloomHashGroup};
-
-pub trait ToNgrams: Sized {
- fn new(items: usize) -> Self;
- fn insert(&mut self, item: &str);
- fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
- let mut filter = Self::new(tokens.len().saturating_sub(1));
- for words in tokens.windows(n) {
- filter.insert(&words.join(" "));
- }
- filter
- }
-}
-
-impl ToNgrams for BloomFilter {
- fn new(items: usize) -> Self {
- BloomFilter::new(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.insert(&item.into())
- }
-}
-
-impl ToNgrams for Vec<BloomHashGroup> {
- fn new(items: usize) -> Self {
- Vec::with_capacity(items)
- }
-
- fn insert(&mut self, item: &str) {
- self.push(BloomHashGroup {
- h1: item.into(),
- h2: None,
- })
- }
-}
+pub mod osb;
diff --git a/crates/nlp/src/transformers/osb.rs b/crates/nlp/src/transformers/osb.rs
new file mode 100644
index 00000000..0c87132d
--- /dev/null
+++ b/crates/nlp/src/transformers/osb.rs
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023 Stalwart Labs Ltd.
+ *
+ * This file is part of the Stalwart Mail Server.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ * in the LICENSE file at the top-level directory of this distribution.
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ * You can be released from the requirements of the AGPLv3 license by
+ * purchasing a commercial license. Please contact licensing@stalw.art
+ * for more details.
+*/
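+
+// Orthogonal Sparse Bigram (OSB) transformer: every token is paired with up
+// to `window_size - 1` of the tokens that follow it, and `idx` records the
+// distance between the pair, producing the sparse bigram features typically
+// fed to a Bayes classifier.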
+*/
+
+use std::iter::Peekable;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct OsbToken<T> {
+    pub inner: T,
+    pub idx: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Gram<'x> {
+    Uni { t1: &'x str },
+    Bi { t1: &'x str, t2: &'x str },
+}
+
+pub struct OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    iter: Peekable<I>,
+    buf: Vec<Option<&'x str>>,
+    window_size: usize,
+    window_pos: usize,
+    window_idx: usize,
+}
+
+impl<'x, I> OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    pub fn new(iter: I, window_size: usize) -> Self {
+        Self {
+            iter: iter.peekable(),
+            buf: vec![None; window_size],
+            window_pos: 0,
+            window_idx: 0,
+            window_size,
+        }
+    }
+}
+
+impl<'x, I> Iterator for OsbTokenizer<'x, I>
+where
+    I: Iterator<Item = &'x str>,
+{
+    type Item = OsbToken<Gram<'x>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let end_pos = (self.window_pos + self.window_idx) % self.window_size;
+        if self.buf[end_pos].is_none() {
+            self.buf[end_pos] = self.iter.next();
+        }
+
+        let t1 = self.buf[self.window_pos % self.window_size]?;
+        let token = OsbToken {
+            inner: if self.window_idx != 0 {
+                Gram::Bi {
+                    t1,
+                    t2: self.buf[end_pos]?,
+                }
+            } else {
+                Gram::Uni { t1 }
+            },
+            idx: self.window_idx,
+        };
+
+        // Increment window
+        self.window_idx += 1;
+        if self.window_idx == self.window_size
+            || (self.iter.peek().is_none()
+                && self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
+        {
+            self.buf[self.window_pos % self.window_size] = None;
+            self.window_idx = 0;
+            self.window_pos += 1;
+        }
+
+        Some(token)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::transformers::osb::{Gram, OsbToken};
+
+    #[test]
+    fn osb_tokenizer() {
+        assert_eq!(
+            super::OsbTokenizer::new(
+                "The quick brown fox jumps over the lazy dog and the lazy cat"
+                    .split_ascii_whitespace(),
+                5
+            )
+            .collect::<Vec<_>>(),
+            vec![
+                OsbToken {
+                    inner: Gram::Uni { t1: "The" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "quick"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "brown"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "fox"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "The",
+                        t2: "jumps"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "quick" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "brown"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "fox"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "jumps"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "quick",
+                        t2: "over"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "brown" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "fox"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "jumps"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "over"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "brown",
+                        t2: "the"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "fox" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "jumps"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "over"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "the"
+                    },
+                    idx: 3
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "fox",
+                        t2: "lazy"
+                    },
+                    idx: 4
+                },
+                OsbToken {
+                    inner: Gram::Uni { t1: "jumps" },
+                    idx: 0
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "jumps",
+                        t2: "over"
+                    },
+                    idx: 1
+                },
+                OsbToken {
+                    inner: Gram::Bi {
+                        t1: "jumps",
+                        t2: "the"
+                    },
+                    idx: 2
+                },
+                OsbToken {
+                    inner: Gram::Bi
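//                The tokenizer under test pairs each word with the following
//                window_size - 1 words, tagging each pair with its distance idx:
//                an OSB (Orthogonal Sparse Bigram) transform, the classic feature
//                set for Bayes-style spam classifiers, where idx lets the
//                classifier tell near pairs from far ones. Minimal sketch using
//                the types defined above (illustrative, not part of this patch):
//
//                    let grams: Vec<_> = OsbTokenizer::new(
//                        "one two three".split_ascii_whitespace(),
//                        3,
//                    )
//                    .collect();
//                    // yields: Uni("one") idx 0, Bi("one","two") idx 1,
//                    //         Bi("one","three") idx 2, Uni("two") idx 0,
//                    //         Bi("two","three") idx 1, Uni("three") idx 0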
{ + t1: "jumps", + t2: "lazy" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "jumps", + t2: "dog" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "over" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "the" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "lazy" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "dog" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "over", + t2: "and" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "the" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "lazy" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "dog" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "and" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "the" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "lazy" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "dog" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "and" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "the" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "lazy" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "dog" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "and" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "the" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "lazy" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Bi { + t1: "dog", + t2: "cat" + }, + idx: 4 + }, + OsbToken { + inner: Gram::Uni { t1: "and" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "the" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "lazy" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Bi { + t1: "and", + t2: "cat" + }, + idx: 3 + }, + OsbToken { + inner: Gram::Uni { t1: "the" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "lazy" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Bi { + t1: "the", + t2: "cat" + }, + idx: 2 + }, + OsbToken { + inner: Gram::Uni { t1: "lazy" }, + idx: 0 + }, + OsbToken { + inner: Gram::Bi { + t1: "lazy", + t2: "cat" + }, + idx: 1 + }, + OsbToken { + inner: Gram::Uni { t1: "cat" }, + idx: 0 + } + ] + ); + } +} diff --git a/crates/store/Cargo.toml b/crates/store/Cargo.toml index 9c4bb149..5a2dc3f5 100644 --- a/crates/store/Cargo.toml +++ b/crates/store/Cargo.toml @@ -6,6 +6,7 @@ resolver = "2" [dependencies] utils = { path = "../utils" } +nlp = { path = "../nlp" } maybe-async = { path = "../maybe-async" } rocksdb = { version = "0.20.1", optional = true } foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true } @@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]} ahash = { version = "0.8.0", features = ["serde"] } bitpacking = "0.8.4" lazy_static = "1.4" -whatlang = "0.16" # Language detection -rust-stemmers = "1.2" # Stemmers -tinysegmenter = "0.1" # Japanese tokenizer -jieba-rs = "0.6" # Chinese stemmer xxhash-rust = { version = "0.8.5", features = ["xxh3"] } farmhash = "1.1.5" -siphasher = "0.3" +siphasher = "1.0" parking_lot = "0.12.1" lru-cache = { version = "0.1.2", optional = true } num_cpus = { version = "1.15.0", optional = true } diff --git a/crates/store/src/fts/bloom.rs b/crates/store/src/fts/bloom.rs index 54905458..31e36427 100644 --- a/crates/store/src/fts/bloom.rs +++ b/crates/store/src/fts/bloom.rs @@ -27,13 +27,12 @@ use std::{ 
     hash::{Hash, Hasher},
 };
 
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
 use roaring::RoaringBitmap;
 use utils::codec::leb128::{Leb128Reader, Leb128Vec};
 
 use crate::{Deserialize, Error, Serialize};
 
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
 pub struct BloomFilter {
     m: u64,
     b: RoaringBitmap,
@@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
     }
 }
 
-impl From<Token<'_>> for BloomHashGroup {
-    fn from(t: Token<'_>) -> Self {
+impl From<Token<Cow<'_, str>>> for BloomHashGroup {
+    fn from(t: Token<Cow<'_, str>>) -> Self {
         Self {
             h1: BloomHash::hash(t.word.as_ref()),
             h2: None,
diff --git a/crates/store/src/fts/builder.rs b/crates/store/src/fts/builder.rs
index 3ddf538f..508d1e87 100644
--- a/crates/store/src/fts/builder.rs
+++ b/crates/store/src/fts/builder.rs
@@ -24,6 +24,14 @@ use std::{borrow::Cow, collections::HashSet};
 
 use ahash::AHashSet;
+use nlp::{
+    language::{
+        detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
+        stemmer::Stemmer,
+        Language,
+    },
+    tokenizers::{space::SpaceTokenizer, Token},
+};
 use utils::map::vec_map::VecMap;
 
 use crate::{
@@ -32,13 +40,7 @@ use crate::{
     Serialize, HASH_EXACT, HASH_STEMMED,
 };
 
-use super::{
-    lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
-    stemmer::Stemmer,
-    term_index::{TermIndexBuilder, TokenIndex},
-    tokenizers::{space::SpaceTokenizer, Token},
-    Language,
-};
+use super::term_index::{TermIndexBuilder, TokenIndex};
 
 pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
 pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
@@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
                 ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
                 terms.push(term_index.add_token(Token {
                     word: token.into(),
-                    offset: 0,
-                    len: 0,
+                    from: 0,
+                    to: 0,
                 }));
             }
             term_index.add_terms(field, 0, terms);
diff --git a/crates/store/src/fts/mod.rs b/crates/store/src/fts/mod.rs
index 3f3d0b9e..8761f076 100644
--- a/crates/store/src/fts/mod.rs
+++ b/crates/store/src/fts/mod.rs
@@ -26,149 +26,13 @@ use crate::{
     BitmapKey, Serialize, BM_HASH,
 };
 
-use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
+use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
 
-pub mod lang;
-//pub mod pdf;
 pub mod bloom;
 pub mod builder;
-pub mod ngram;
 pub mod query;
 pub mod search_snippet;
-pub mod stemmer;
 pub mod term_index;
-pub mod tokenizers;
-
-#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
-pub enum Language {
-    Esperanto = 0,
-    English = 1,
-    Russian = 2,
-    Mandarin = 3,
-    Spanish = 4,
-    Portuguese = 5,
-    Italian = 6,
-    Bengali = 7,
-    French = 8,
-    German = 9,
-    Ukrainian = 10,
-    Georgian = 11,
-    Arabic = 12,
-    Hindi = 13,
-    Japanese = 14,
-    Hebrew = 15,
-    Yiddish = 16,
-    Polish = 17,
-    Amharic = 18,
-    Javanese = 19,
-    Korean = 20,
-    Bokmal = 21,
-    Danish = 22,
-    Swedish = 23,
-    Finnish = 24,
-    Turkish = 25,
-    Dutch = 26,
-    Hungarian = 27,
-    Czech = 28,
-    Greek = 29,
-    Bulgarian = 30,
-    Belarusian = 31,
-    Marathi = 32,
-    Kannada = 33,
-    Romanian = 34,
-    Slovene = 35,
-    Croatian = 36,
-    Serbian = 37,
-    Macedonian = 38,
-    Lithuanian = 39,
-    Latvian = 40,
-    Estonian = 41,
-    Tamil = 42,
-    Vietnamese = 43,
-    Urdu = 44,
-    Thai = 45,
-    Gujarati = 46,
-    Uzbek = 47,
-    Punjabi = 48,
-    Azerbaijani = 49,
-    Indonesian = 50,
-    Telugu = 51,
-    Persian = 52,
-    Malayalam = 53,
-    Oriya = 54,
-    Burmese = 55,
-    Nepali = 56,
-    Sinhalese = 57,
-    Khmer = 58,
-    Turkmen = 59,
-    Akan = 60,
-    Zulu = 61,
-    Shona = 62,
-    Afrikaans = 63,
-    Latin = 64,
-    Slovak = 65,
-    Catalan = 66,
-    Tagalog = 67,
-    Armenian = 68,
-    Unknown = 69,
-    None = 70,
-}
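// The enum removed here is not dropped: it moves into the new nlp crate and is
// imported back as nlp::language::Language by the use statements added in
// builder.rs above and in query.rs and term_index.rs below. Illustrative call
// after the move, assuming the ISO-639 helper travels with the enum:
//
//     use nlp::language::Language;
//
//     let lang = Language::from_iso_639("en").unwrap_or(Language::Unknown);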
-
-impl Language {
-    pub fn from_iso_639(code: &str) -> Option<Self> {
-        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
-            "en" => Language::English,
-            "es" => Language::Spanish,
-            "pt" => Language::Portuguese,
-            "it" => Language::Italian,
-            "fr" => Language::French,
-            "de" => Language::German,
-            "ru" => Language::Russian,
-            "zh" => Language::Mandarin,
-            "ja" => Language::Japanese,
-            "ar" => Language::Arabic,
-            "hi" => Language::Hindi,
-            "ko" => Language::Korean,
-            "bn" => Language::Bengali,
-            "he" => Language::Hebrew,
-            "ur" => Language::Urdu,
-            "fa" => Language::Persian,
-            "ml" => Language::Malayalam,
-            "or" => Language::Oriya,
-            "my" => Language::Burmese,
-            "ne" => Language::Nepali,
-            "si" => Language::Sinhalese,
-            "km" => Language::Khmer,
-            "tk" => Language::Turkmen,
-            "am" => Language::Amharic,
-            "az" => Language::Azerbaijani,
-            "id" => Language::Indonesian,
-            "te" => Language::Telugu,
-            "ta" => Language::Tamil,
-            "vi" => Language::Vietnamese,
-            "gu" => Language::Gujarati,
-            "pa" => Language::Punjabi,
-            "uz" => Language::Uzbek,
-            "hy" => Language::Armenian,
-            "ka" => Language::Georgian,
-            "la" => Language::Latin,
-            "sl" => Language::Slovene,
-            "hr" => Language::Croatian,
-            "sr" => Language::Serbian,
-            "mk" => Language::Macedonian,
-            "lt" => Language::Lithuanian,
-            "lv" => Language::Latvian,
-            "et" => Language::Estonian,
-            "tl" => Language::Tagalog,
-            "af" => Language::Afrikaans,
-            "zu" => Language::Zulu,
-            "sn" => Language::Shona,
-            "ak" => Language::Akan,
-            _ => return None,
-        }
-        .into()
-    }
-}
 
 impl BitmapKey<Vec<u8>> {
     pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
@@ -209,19 +73,3 @@ impl Operation {
         }
     }
 }
-
-impl Language {
-    pub fn detect(text: String, default: Language) -> (String, Language) {
-        if let Some((l, t)) = text
-            .split_once(':')
-            .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
-        {
-            (t.to_string(), l)
-        } else {
-            let l = LanguageDetector::detect_single(&text)
-                .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
-                .unwrap_or(default);
-            (text, l)
-        }
-    }
-}
diff --git a/crates/store/src/fts/query.rs b/crates/store/src/fts/query.rs
index 09439d30..77bc4dbd 100644
--- a/crates/store/src/fts/query.rs
+++ b/crates/store/src/fts/query.rs
@@ -21,14 +21,14 @@
  * for more details.
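// The Language::detect helper removed above keeps a two-step contract: an
// explicit "iso-code:text" prefix wins, otherwise whatlang-based detection is
// trusted only above the 0.3 confidence cutoff. Illustrative call, assuming the
// signature is unchanged after its move to the nlp crate:
//
//     let (text, lang) = Language::detect("en:hello world".to_string(), Language::None);
//     assert_eq!(lang, Language::English);
//     assert_eq!(text, "hello world");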
 */
 
+use nlp::language::{stemmer::Stemmer, Language};
 use roaring::RoaringBitmap;
 
 use crate::{
-    fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
-    BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
+    fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
 };
 
-use super::{term_index::TermIndex, Language};
+use super::term_index::TermIndex;
 
 impl ReadTransaction<'_> {
     #[maybe_async::maybe_async]
@@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
         if match_phrase {
             let mut phrase = Vec::new();
             let mut bit_keys = Vec::new();
-            for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
+            for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
                 let key = BitmapKey::hash(
                     token.word.as_ref(),
                     account_id,
diff --git a/crates/store/src/fts/search_snippet.rs b/crates/store/src/fts/search_snippet.rs
index 89c557b1..55d6b6b7 100644
--- a/crates/store/src/fts/search_snippet.rs
+++ b/crates/store/src/fts/search_snippet.rs
@@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
 
 #[cfg(test)]
 mod tests {
+    use nlp::language::Language;
+
     use crate::{
-        fts::{
-            term_index::{TermIndex, TermIndexBuilder},
-            tokenizers::Tokenizer,
-            Language,
-        },
+        fts::term_index::{TermIndex, TermIndexBuilder},
         Deserialize, Serialize,
     };
 
@@ -242,7 +240,7 @@ mod tests {
         for (field_num, part) in parts.iter().enumerate() {
             let mut terms = Vec::new();
-            for token in Tokenizer::new(part, Language::English, 40) {
+            for token in Language::English.tokenize_text(part, 40) {
                 terms.push(builder.add_token(token));
             }
             builder.add_terms(field_num as u8, 0, terms);
diff --git a/crates/store/src/fts/term_index.rs b/crates/store/src/fts/term_index.rs
index e2653853..b91f74db 100644
--- a/crates/store/src/fts/term_index.rs
+++ b/crates/store/src/fts/term_index.rs
@@ -21,14 +21,13 @@
  * for more details.
  */
 
-use std::convert::TryInto;
+use std::{borrow::Cow, convert::TryInto};
 
 use crate::{Deserialize, Serialize};
 
-use super::{stemmer::StemmedToken, tokenizers::Token};
-
 use ahash::{AHashMap, AHashSet};
 use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
+use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
 use utils::codec::leb128::{Leb128Reader, Leb128Vec};
 
 #[derive(Debug)]
@@ -227,7 +226,7 @@ impl TermIndexBuilder {
         }
     }
 
-    pub fn add_token(&mut self, token: Token) -> Term {
+    pub fn add_token(&mut self, token: Token<Cow<'_, str>>) -> Term {
         let id = self.terms.len() as u32;
         let id = self
             .terms
@@ -236,8 +235,8 @@ impl TermIndexBuilder {
         Term {
             id: *id,
            id_stemmed: *id,
-            offset: token.offset,
-            len: token.len,
+            offset: token.from as u32,
+            len: (token.to - token.from) as u8,
         }
     }
@@ -259,8 +258,8 @@ impl TermIndexBuilder {
         Term {
             id,
             id_stemmed,
-            offset: token.offset,
-            len: token.len,
+            offset: token.from as u32,
+            len: (token.to - token.from) as u8,
         }
     }
@@ -775,13 +774,10 @@ impl TokenIndex {
 mod tests {
     use ahash::AHashMap;
 
+    use nlp::language::{stemmer::Stemmer, Language};
+
     use crate::{
-        fts::{
-            stemmer::Stemmer,
-            term_index::{TermIndexBuilder, TokenIndex},
-            Language,
-        },
+        fts::term_index::{TermIndexBuilder, TokenIndex},
         Deserialize, Serialize,
     };
diff --git a/crates/store/src/fts/tokenizers/chinese.rs b/crates/store/src/fts/tokenizers/chinese.rs
deleted file mode 100644
index e741571d..00000000
--- a/crates/store/src/fts/tokenizers/chinese.rs
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2023, Stalwart Labs Ltd.
- *
- * This file is part of Stalwart Mail Server.
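// The language-specific tokenizers deleted below are superseded by the nlp
// crate, and the old Tokenizer::new(text, language, max_len) dispatch is now
// reached through language.tokenize_text(text, max_len), as the query.rs and
// search_snippet.rs hunks above show. Illustrative sketch of the new entry
// point (token.from/token.to per the builder.rs change above; the dispatch by
// language is assumed to match the deleted code):
//
//     use nlp::language::Language;
//
//     for token in Language::English.tokenize_text("hello world", 40) {
//         println!("{} @ {}..{}", token.word, token.from, token.to);
//     }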
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::{borrow::Cow, vec::IntoIter}; - -use jieba_rs::Jieba; - -use super::{word::WordTokenizer, Token}; -use lazy_static::lazy_static; - -lazy_static! { - static ref JIEBA: Jieba = Jieba::new(); -} - -pub struct ChineseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter<&'x str>, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> ChineseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer { - ChineseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for ChineseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - loop { - if let Some(ch_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += ch_token.len(); - - if ch_token.len() <= self.max_token_length { - return Token::new(offset_start, ch_token.len(), ch_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - let word = match token.word { - Cow::Borrowed(word) => word, - Cow::Owned(_) => unreachable!(), - }; - self.tokens = JIEBA.cut(word, false).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn chinese_tokenizer() { - assert_eq!( - ChineseTokenizer::new( - "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。", - 40 - ) - .collect::>(), - vec![ - Token { - word: "孫".into(), - offset: 0, - len: 3 - }, - Token { - word: "子".into(), - offset: 3, - len: 3 - }, - Token { - word: "曰".into(), - offset: 6, - len: 3 - }, - Token { - word: "兵".into(), - offset: 12, - len: 3 - }, - Token { - word: "者".into(), - offset: 15, - len: 3 - }, - Token { - word: "國".into(), - offset: 21, - len: 3 - }, - Token { - word: "之".into(), - offset: 24, - len: 3 - }, - Token { - word: "大事".into(), - offset: 27, - len: 6 - }, - Token { - word: "死".into(), - offset: 36, - len: 3 - }, - Token { - word: "生".into(), - offset: 39, - len: 3 - }, - Token { - word: "之".into(), - offset: 42, - len: 3 - }, - Token { - word: "地".into(), - offset: 45, - len: 3 - }, - Token { - word: "存亡".into(), - offset: 51, - len: 6 - }, - Token { - word: "之".into(), - offset: 57, - len: 3 - }, - Token { - word: "道".into(), - offset: 60, - len: 3 - }, - Token { - word: "不可不".into(), - offset: 
66, - len: 9 - }, - Token { - word: "察".into(), - offset: 75, - len: 3 - }, - Token { - word: "也".into(), - offset: 78, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/japanese.rs b/crates/store/src/fts/tokenizers/japanese.rs deleted file mode 100644 index 816ba0a3..00000000 --- a/crates/store/src/fts/tokenizers/japanese.rs +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::vec::IntoIter; - -use super::{word::WordTokenizer, Token}; - -pub struct JapaneseTokenizer<'x> { - word_tokenizer: WordTokenizer<'x>, - tokens: IntoIter, - token_offset: usize, - token_len: usize, - token_len_cur: usize, - max_token_length: usize, -} - -impl<'x> JapaneseTokenizer<'x> { - pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer { - JapaneseTokenizer { - word_tokenizer: WordTokenizer::new(text), - tokens: Vec::new().into_iter(), - max_token_length, - token_offset: 0, - token_len: 0, - token_len_cur: 0, - } - } -} - -impl<'x> Iterator for JapaneseTokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - loop { - if let Some(jp_token) = self.tokens.next() { - let offset_start = self.token_offset + self.token_len_cur; - self.token_len_cur += jp_token.len(); - - if jp_token.len() <= self.max_token_length { - return Token::new(offset_start, jp_token.len(), jp_token.into()).into(); - } - } else { - loop { - let (token, is_ascii) = self.word_tokenizer.next()?; - if !is_ascii { - self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter(); - self.token_offset = token.offset as usize; - self.token_len = token.len as usize; - self.token_len_cur = 0; - break; - } else if token.len as usize <= self.max_token_length { - return token.into(); - } - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn japanese_tokenizer() { - assert_eq!( - JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40) - .collect::>(), - vec![ - Token { - word: "お先".into(), - offset: 0, - len: 6 - }, - Token { - word: "に".into(), - offset: 6, - len: 3 - }, - Token { - word: "失礼".into(), - offset: 9, - len: 6 - }, - Token { - word: "し".into(), - offset: 15, - len: 3 - }, - Token { - word: "ます".into(), - offset: 18, - len: 6 - }, - Token { - word: "あなた".into(), - offset: 25, - len: 9 - }, - Token { - word: "の".into(), - offset: 34, - len: 3 - }, - Token { - word: "名前".into(), - offset: 37, - len: 6 - }, - Token { - word: "は".into(), - offset: 43, - len: 3 - }, - Token { - word: "何".into(), - offset: 46, - len: 3 - }, - Token { - word: "です".into(), - offset: 49, - len: 6 - }, - Token 
{ - word: "か".into(), - offset: 55, - len: 3 - }, - Token { - word: "123".into(), - offset: 59, - len: 3 - }, - Token { - word: "abc".into(), - offset: 63, - len: 3 - }, - Token { - word: "872".into(), - offset: 67, - len: 3 - } - ] - ); - } -} diff --git a/crates/store/src/fts/tokenizers/mod.rs b/crates/store/src/fts/tokenizers/mod.rs deleted file mode 100644 index 3679b2b3..00000000 --- a/crates/store/src/fts/tokenizers/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -pub mod chinese; -pub mod indo_european; -pub mod japanese; -pub mod space; -pub mod word; - -use std::borrow::Cow; - -use self::{ - chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer, -}; - -use super::Language; - -#[derive(Debug, PartialEq, Eq)] -pub struct Token<'x> { - pub word: Cow<'x, str>, - pub offset: u32, // Word offset in the text part - pub len: u8, // Word length -} - -impl<'x> Token<'x> { - pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> { - debug_assert!(offset <= u32::max_value() as usize); - debug_assert!(len <= u8::max_value() as usize); - Token { - offset: offset as u32, - len: len as u8, - word, - } - } -} - -enum LanguageTokenizer<'x> { - IndoEuropean(IndoEuropeanTokenizer<'x>), - Japanese(JapaneseTokenizer<'x>), - Chinese(ChineseTokenizer<'x>), -} - -pub struct Tokenizer<'x> { - tokenizer: LanguageTokenizer<'x>, -} - -impl<'x> Tokenizer<'x> { - pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self { - Tokenizer { - tokenizer: match language { - Language::Japanese => { - LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length)) - } - Language::Mandarin => { - LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length)) - } - _ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new( - text, - max_token_length, - )), - }, - } - } -} - -impl<'x> Iterator for Tokenizer<'x> { - type Item = Token<'x>; - - fn next(&mut self) -> Option { - match &mut self.tokenizer { - LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(), - LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(), - LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(), - } - } -} diff --git a/crates/store/src/fts/tokenizers/word.rs b/crates/store/src/fts/tokenizers/word.rs deleted file mode 100644 index 3e50ba1a..00000000 --- a/crates/store/src/fts/tokenizers/word.rs +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2023, Stalwart Labs Ltd. - * - * This file is part of Stalwart Mail Server. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * in the LICENSE file at the top-level directory of this distribution. - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - * You can be released from the requirements of the AGPLv3 license by - * purchasing a commercial license. Please contact licensing@stalw.art - * for more details. -*/ - -use std::str::CharIndices; - -use super::Token; - -pub struct WordTokenizer<'x> { - text: &'x str, - iterator: CharIndices<'x>, -} - -impl<'x> WordTokenizer<'x> { - pub fn new(text: &str) -> WordTokenizer { - WordTokenizer { - text, - iterator: text.char_indices(), - } - } -} - -/// Parses text into tokens, used by non-IndoEuropean tokenizers. -impl<'x> Iterator for WordTokenizer<'x> { - type Item = (Token<'x>, bool); - - fn next(&mut self) -> Option { - let mut is_ascii = true; - while let Some((token_start, ch)) = self.iterator.next() { - if ch.is_alphanumeric() { - let token_end = (&mut self.iterator) - .filter_map(|(pos, ch)| { - if ch.is_alphanumeric() { - if is_ascii && !ch.is_ascii() { - is_ascii = false; - } - None - } else { - pos.into() - } - }) - .next() - .unwrap_or(self.text.len()); - - let token_len = token_end - token_start; - if token_end > token_start { - return ( - Token::new( - token_start, - token_len, - self.text[token_start..token_end].into(), - ), - is_ascii, - ) - .into(); - } - } - } - None - } -} diff --git a/crates/store/src/query/filter.rs b/crates/store/src/query/filter.rs index 5b74a9ae..9e4b7109 100644 --- a/crates/store/src/query/filter.rs +++ b/crates/store/src/query/filter.rs @@ -24,12 +24,10 @@ use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; use ahash::HashSet; +use nlp::tokenizers::space::SpaceTokenizer; use roaring::RoaringBitmap; -use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - BitmapKey, ReadTransaction, Store, -}; +use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store}; use super::{Filter, ResultSet, TextMatch}; diff --git a/crates/store/src/query/mod.rs b/crates/store/src/query/mod.rs index 86f7eec9..05442caf 100644 --- a/crates/store/src/query/mod.rs +++ b/crates/store/src/query/mod.rs @@ -26,11 +26,10 @@ pub mod get; pub mod log; pub mod sort; +use nlp::language::Language; use roaring::RoaringBitmap; -use crate::{ - fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS, -}; +use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Operator { diff --git a/crates/store/src/write/mod.rs b/crates/store/src/write/mod.rs index 48d8027a..44826133 100644 --- a/crates/store/src/write/mod.rs +++ b/crates/store/src/write/mod.rs @@ -23,11 +23,11 @@ use std::{collections::HashSet, slice::Iter, time::SystemTime}; +use nlp::tokenizers::space::SpaceTokenizer; use utils::codec::leb128::{Leb128Iterator, Leb128Vec}; use crate::{ - fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer}, - 
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, + fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC, }; use self::assert::AssertValue; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index cce3410f..920cd28c 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -12,6 +12,7 @@ foundationdb = ["store/foundation"] [dependencies] store = { path = "../crates/store", features = ["test_mode"] } +nlp = { path = "../crates/nlp" } directory = { path = "../crates/directory" } jmap = { path = "../crates/jmap", features = ["test_mode"] } jmap_proto = { path = "../crates/jmap-proto" } diff --git a/tests/src/store/query.rs b/tests/src/store/query.rs index cd154593..7e1ab6c9 100644 --- a/tests/src/store/query.rs +++ b/tests/src/store/query.rs @@ -27,10 +27,11 @@ use std::{ }; use jmap_proto::types::keyword::Keyword; +use nlp::language::Language; use store::{ahash::AHashMap, query::sort::Pagination}; use store::{ - fts::{builder::FtsIndexBuilder, Language}, + fts::builder::FtsIndexBuilder, query::{Comparator, Filter}, write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE}, Store, ValueKey,