mirror of
https://github.com/stalwartlabs/mail-server.git
synced 2024-09-20 07:16:18 +08:00
Bayes classifier, type tokenizer and NLP module reorganization
This commit is contained in:
parent
a0812095ef
commit
3d9efd363a
|
@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. This projec
|
|||
## [0.3.9] - 2023-10-07
|
||||
|
||||
## Added
|
||||
- Support for reading environment variables from configuration file using the `!ENV_VAR_NAME` special keyword.
|
||||
- Support for reading environment variables from the configuration file using the `!ENV_VAR_NAME` special keyword.
|
||||
- Option to disable ANSI color codes in logs.
|
||||
|
||||
### Changed
|
||||
- Querying directories from a Sieve script is now done using the `query()` method from `eval`. Your scripts will need to be updated, please refer to the [new syntax](https://stalw.art/docs/smtp/filter/sieve#directory-queries).
|
||||
|
|
116
Cargo.lock
generated
116
Cargo.lock
generated
|
@ -169,13 +169,6 @@ dependencies = [
|
|||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "antispam"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"fancy-regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.75"
|
||||
|
@ -1487,25 +1480,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
|
|||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.4"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480"
|
||||
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno-dragonfly"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "etcetera"
|
||||
version = "0.8.0"
|
||||
|
@ -2252,6 +2234,7 @@ dependencies = [
|
|||
"mail-parser",
|
||||
"mail-send",
|
||||
"md5",
|
||||
"nlp",
|
||||
"parking_lot",
|
||||
"rustls 0.21.7",
|
||||
"rustls-pemfile",
|
||||
|
@ -2450,6 +2433,7 @@ dependencies = [
|
|||
"mail-parser",
|
||||
"mail-send",
|
||||
"mime",
|
||||
"nlp",
|
||||
"p256",
|
||||
"rand 0.8.5",
|
||||
"rasn",
|
||||
|
@ -2510,9 +2494,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.26"
|
||||
version = "0.1.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
|
||||
checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
@ -2703,9 +2687,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.8"
|
||||
version = "0.4.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db"
|
||||
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
|
@ -2754,7 +2738,7 @@ dependencies = [
|
|||
"mail-parser",
|
||||
"parking_lot",
|
||||
"quick-xml 0.30.0",
|
||||
"ring 0.17.2",
|
||||
"ring 0.17.3",
|
||||
"rustls-pemfile",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -3001,6 +2985,30 @@ dependencies = [
|
|||
"pin-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nlp"
|
||||
version = "0.3.9"
|
||||
dependencies = [
|
||||
"ahash 0.8.3",
|
||||
"bincode",
|
||||
"farmhash",
|
||||
"jieba-rs",
|
||||
"lazy_static",
|
||||
"nohash",
|
||||
"rust-stemmers",
|
||||
"serde",
|
||||
"siphasher 1.0.0",
|
||||
"tinysegmenter",
|
||||
"whatlang",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nohash"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca"
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.3"
|
||||
|
@ -3072,9 +3080,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.16"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
|
||||
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
|
@ -3476,7 +3484,7 @@ version = "0.10.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
"siphasher 0.3.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -3485,7 +3493,7 @@ version = "0.11.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
"siphasher 0.3.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -3791,9 +3799,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rasn"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2cf5174961dbfd4f03b57e71e5a11b034f564d5f0b133d63e39d703ac3d2876b"
|
||||
checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2"
|
||||
dependencies = [
|
||||
"arrayvec",
|
||||
"bitvec",
|
||||
|
@ -3813,9 +3821,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rasn-cms"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56517898cf38bb50fdb6479049ed476510bf59ae7d329b35129dc8a8b309697f"
|
||||
checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5"
|
||||
dependencies = [
|
||||
"rasn",
|
||||
"rasn-pkix",
|
||||
|
@ -3823,9 +3831,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rasn-derive"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8def4ce07f970be91bad36c3090af419dcd9e696897ada3cf74bd480e0101d61"
|
||||
checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618"
|
||||
dependencies = [
|
||||
"either",
|
||||
"itertools 0.10.5",
|
||||
|
@ -3838,9 +3846,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rasn-pkix"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebdeef45b70d4c20ce34725707b2784c761eacaaa4d841eab46f9f9c6dc10dd3"
|
||||
checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8"
|
||||
dependencies = [
|
||||
"rasn",
|
||||
]
|
||||
|
@ -4024,9 +4032,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.2"
|
||||
version = "0.17.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "911b295d2d302948838c8ac142da1ee09fa7863163b44e6715bc9357905878b8"
|
||||
checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"getrandom 0.2.10",
|
||||
|
@ -4198,9 +4206,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.17"
|
||||
version = "0.38.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7"
|
||||
checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c"
|
||||
dependencies = [
|
||||
"bitflags 2.4.0",
|
||||
"errno",
|
||||
|
@ -4644,6 +4652,12 @@ version = "0.3.11"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.9"
|
||||
|
@ -5048,10 +5062,10 @@ dependencies = [
|
|||
"farmhash",
|
||||
"foundationdb",
|
||||
"futures",
|
||||
"jieba-rs",
|
||||
"lazy_static",
|
||||
"lru-cache",
|
||||
"maybe-async 0.2.7",
|
||||
"nlp",
|
||||
"num_cpus",
|
||||
"parking_lot",
|
||||
"r2d2",
|
||||
|
@ -5061,14 +5075,11 @@ dependencies = [
|
|||
"rocksdb",
|
||||
"rusqlite",
|
||||
"rust-s3",
|
||||
"rust-stemmers",
|
||||
"serde",
|
||||
"siphasher",
|
||||
"tinysegmenter",
|
||||
"siphasher 1.0.0",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"whatlang",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
|
@ -5244,6 +5255,7 @@ dependencies = [
|
|||
"mail-parser",
|
||||
"mail-send",
|
||||
"managesieve",
|
||||
"nlp",
|
||||
"num_cpus",
|
||||
"rayon",
|
||||
"reqwest",
|
||||
|
@ -5358,9 +5370,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
|||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.32.0"
|
||||
version = "1.33.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9"
|
||||
checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
|
@ -6040,12 +6052,12 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "webpki"
|
||||
version = "0.22.2"
|
||||
version = "0.22.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
|
||||
checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
|
||||
dependencies = [
|
||||
"ring 0.16.20",
|
||||
"untrusted 0.7.1",
|
||||
"ring 0.17.3",
|
||||
"untrusted 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -8,9 +8,9 @@ members = [
|
|||
"crates/imap-proto",
|
||||
"crates/smtp",
|
||||
"crates/managesieve",
|
||||
"crates/nlp",
|
||||
"crates/store",
|
||||
"crates/directory",
|
||||
"crates/antispam",
|
||||
"crates/utils",
|
||||
"crates/maybe-async",
|
||||
"crates/cli",
|
||||
|
|
|
@ -38,6 +38,7 @@ Key features:
|
|||
- OAuth 2.0 [authorization code](https://www.rfc-editor.org/rfc/rfc8628) and [device authorization](https://www.rfc-editor.org/rfc/rfc8628) flows.
|
||||
- Access Control Lists (ACLs).
|
||||
- Rate limiting.
|
||||
- Security audited (read the [report](https://stalw.art/blog/security-audit)).
|
||||
- **Robust and scalable**:
|
||||
- **FoundationDB** or **SQLite** database backends.
|
||||
- **S3-compatible** blob storage support.
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
[package]
|
||||
name = "antispam"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
fancy-regex = "0.11.0"
|
|
@ -1,64 +0,0 @@
|
|||
use std::path::PathBuf;
|
||||
|
||||
use import::spamassassin::import_spamassassin;
|
||||
|
||||
pub mod import;
|
||||
|
||||
fn main() {
|
||||
import_spamassassin(
|
||||
PathBuf::from("/Users/me/code/mail-server/resources/spamassassin"),
|
||||
"cf".to_string(),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
const _IGNORE: &str = r#"
|
||||
|
||||
[antispam]
|
||||
required-score = 5
|
||||
add-headers = ["X-Spam-Checker-Version: SpamAssassin _VERSION_ (_SUBVERSION_) on _HOSTNAME_",
|
||||
"X-Spam-Flag: _YESNOCAPS_", "X-Spam-Level: _STARS(*)_",
|
||||
"X-Spam-Status: _YESNO_, score=_SCORE_ required=_REQD_ tests=_TESTS_ autolearn=_AUTOLEARN_ version=_VERSION_"]
|
||||
originating-ip-headers = ["X-Yahoo-Post-IP", "X-Originating-IP", "X-Apparently-From",
|
||||
"X-SenderIP X-AOL-IP", "X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp"]
|
||||
rewrite-headers = ["Subject: [SPAM] _SUBJECT_"]
|
||||
redirect-patterns = ["""m'/(?:index.php)?\?.*(?<=[?&])URL=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/url\?.*?(?<=[?&])q=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:site|inurl):(.*?)(?:$|%20|[\s+&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:"|%22)(.*?)(?:$|%22|["\s+&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/translate\?.*?(?<=[?&])u=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/pagead/iclk\?.*?(?<=[?&])adurl=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?:/*(?:\w+\.)?aol\.com/redir\.adp\?.*(?<=[?&])_url=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?/*(?:\w+\.)?facebook\.com/l/;(.*)'i""",
|
||||
"""/^http:\/\/chkpt\.zdnet\.com\/chkpt\/\w+\/(.*)$/i""",
|
||||
"""/^http:\/\/www(?:\d+)?\.nate\.com\/r\/\w+\/(.*)$/i""",
|
||||
"""/^http:\/\/.+\.gov\/(?:.*\/)?externalLink\.jhtml\?.*url=(.*?)(?:&.*)?$/i""",
|
||||
"""/^http:\/\/redir\.internet\.com\/.+?\/.+?\/(.*)$/i""",
|
||||
"""/^http:\/\/(?:.*?\.)?adtech\.de\/.*(?:;|\|)link=(.*?)(?:;|$)/i""",
|
||||
"""m'^http.*?/redirect\.php\?.*(?<=[?&])goto=(.*?)(?:$|[&\#])'i""",
|
||||
"""m'^https?:/*(?:[^/]+\.)?emf\d\.com/r\.cfm.*?&r=(.*)'i"""
|
||||
]
|
||||
|
||||
[antispam.autolearn]
|
||||
enable = true
|
||||
ignore-headers = [ "X-ACL-Warn", "X-Alimail-AntiSpam", "X-Amavis-Modified", "X-Anti*", "X-aol-global-disposition",
|
||||
"X-ASF-*", "X-Assp-Version", "X-Authority-Analysis", "X-Authvirus", "X-Auto-Response-Suppress", "X-AV-Do-Run",
|
||||
"X-AV-Status", "X-avast-antispam", "X-Backend", "X-Barracuda*", "X-Bayes*", "X-BitDefender*", "X-BL", "X-Bogosity",
|
||||
"X-Boxtrapper", "X-Brightmail-Tracker", "X-BTI-AntiSpam", "X-Bugzilla-Version", "X-CanIt*", "X-Clapf-spamicity",
|
||||
"X-Cloud-Security", "X-CM-Score", "X-CMAE-*", "X-Company", "X-Coremail-Antispam", "X-CRM114-*", "X-CT-Spam",
|
||||
"X-CTCH-*", "X-Drweb-SpamState", "X-DSPAM*", "X-eavas*", "X-Enigmail-Version", "X-Eset*", "X-Exchange-Antispam-Report",
|
||||
"X-ExtloopSabreCommercials1", "X-EYOU-SPAMVALUE", "X-FB-OUTBOUND-SPAM", "X-FEAS-SBL", "X-FILTER-SCORE", "X-Forefront*",
|
||||
"X-Fuglu*", "X-getmail-filter-classifier", "X-GFIME-MASPAM", "X-Gmane-NNTP-Posting-Host", "X-GMX-Anti*", "X-He-Spam",
|
||||
"X-hMailServer-Spam", "X-IAS", "X-iGspam-global", "X-Injected-Via-Gmane", "X-Interia-Antivirus", "X-IP-Spam-Verdict",
|
||||
"X-Ironport*", "X-Junk*", "X-KLMS-*", "X-KMail-*", "X-MailCleaner-*", "X-MailFoundry", "X-MDMailLookup-Result",
|
||||
"X-ME-*", "X-MessageFilter", "X-Microsoft-Antispam", "X-Mlf-Version", "X-MXScan-*", "X-NAI-Spam-*", "X-NetStation-Status",
|
||||
"X-OVH-SPAM*", "X-PerlMx-*", "X-PFSI-Info", "X-PMX-*", "X-Policy-Service", "X-policyd-weight", "X-PreRBLs",
|
||||
"X-Probable-Spam", "X-PROLinux-SpamCheck", "X-Proofpoint-*", "x-purgate-*", "X-Qmail-Scanner-*", "X-Quarantine-ID",
|
||||
"X-RSpam-Report", "X-SA-*", "X-Scanned-by", "X-SmarterMail-CustomSpamHeader", "X-Spam*", "X-SPF-Scan-By", "X-STA-*",
|
||||
"X-StarScan-Version", "X-SurGATE-Result", "X-SWITCHham-Score", "X-UI-*", "X-Univie*", "X-Virus*", "X-VR-*",
|
||||
"X-WatchGuard*", "X-Whitelist-Domain", "X-WUM-CCI", "X_CMAE_Category" ]
|
||||
threshold.ham = 0.1
|
||||
threshold.spam = 12.0
|
||||
|
||||
|
||||
"#;
|
|
@ -10,6 +10,7 @@ jmap = { path = "../jmap" }
|
|||
jmap_proto = { path = "../jmap-proto" }
|
||||
directory = { path = "../directory" }
|
||||
store = { path = "../store" }
|
||||
nlp = { path = "../nlp" }
|
||||
utils = { path = "../utils" }
|
||||
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "ludicrous_mode"] }
|
||||
mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] }
|
||||
|
|
|
@ -34,8 +34,9 @@ use imap_proto::{
|
|||
|
||||
use jmap_proto::types::{collection::Collection, id::Id, keyword::Keyword, property::Property};
|
||||
use mail_parser::HeaderName;
|
||||
use nlp::language::Language;
|
||||
use store::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, Language},
|
||||
fts::builder::MAX_TOKEN_LENGTH,
|
||||
query::{self, log::Query, sort::Pagination, ResultSet},
|
||||
roaring::RoaringBitmap,
|
||||
write::now,
|
||||
|
|
|
@ -6,6 +6,7 @@ resolver = "2"
|
|||
|
||||
[dependencies]
|
||||
store = { path = "../store" }
|
||||
nlp = { path = "../nlp" }
|
||||
jmap_proto = { path = "../jmap-proto" }
|
||||
smtp = { path = "../smtp" }
|
||||
utils = { path = "../utils" }
|
||||
|
|
|
@ -23,10 +23,8 @@
|
|||
|
||||
use std::{str::FromStr, time::Duration};
|
||||
|
||||
use store::{
|
||||
fts::Language,
|
||||
rand::{distributions::Alphanumeric, thread_rng, Rng},
|
||||
};
|
||||
use nlp::language::Language;
|
||||
use store::rand::{distributions::Alphanumeric, thread_rng, Rng};
|
||||
|
||||
use super::session::BaseCapabilities;
|
||||
|
||||
|
|
|
@ -37,11 +37,9 @@ use mail_parser::{
|
|||
parsers::{fields::thread::thread_name, preview::preview_text},
|
||||
Addr, Address, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType,
|
||||
};
|
||||
use nlp::language::Language;
|
||||
use store::{
|
||||
fts::{
|
||||
builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
|
||||
Language,
|
||||
},
|
||||
fts::builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
|
||||
write::{BatchBuilder, IntoOperations, F_BITMAP, F_CLEAR, F_INDEX, F_VALUE},
|
||||
};
|
||||
|
||||
|
|
|
@ -28,8 +28,9 @@ use jmap_proto::{
|
|||
types::{acl::Acl, collection::Collection, keyword::Keyword, property::Property},
|
||||
};
|
||||
use mail_parser::HeaderName;
|
||||
use nlp::language::Language;
|
||||
use store::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, Language},
|
||||
fts::builder::MAX_TOKEN_LENGTH,
|
||||
query::{self},
|
||||
roaring::RoaringBitmap,
|
||||
ValueKey,
|
||||
|
|
|
@ -30,14 +30,12 @@ use jmap_proto::{
|
|||
types::{acl::Acl, collection::Collection},
|
||||
};
|
||||
use mail_parser::{decoders::html::html_to_text, MessageParser, PartType};
|
||||
use nlp::language::{stemmer::Stemmer, Language};
|
||||
use store::{
|
||||
fts::{
|
||||
builder::MAX_TOKEN_LENGTH,
|
||||
search_snippet::generate_snippet,
|
||||
stemmer::Stemmer,
|
||||
term_index::{self, TermIndex},
|
||||
tokenizers::Tokenizer,
|
||||
Language,
|
||||
},
|
||||
BlobKind,
|
||||
};
|
||||
|
@ -66,7 +64,8 @@ impl JMAP {
|
|||
|| (text.starts_with('\'') && text.ends_with('\''))
|
||||
{
|
||||
terms.push(
|
||||
Tokenizer::new(&text, language, MAX_TOKEN_LENGTH)
|
||||
language
|
||||
.tokenize_text(&text, MAX_TOKEN_LENGTH)
|
||||
.map(|token| (token.word.into_owned(), None))
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
|
|
|
@ -40,6 +40,7 @@ use jmap_proto::{
|
|||
},
|
||||
types::{collection::Collection, property::Property},
|
||||
};
|
||||
use nlp::language::Language;
|
||||
use services::{
|
||||
delivery::spawn_delivery_manager,
|
||||
housekeeper::{self, init_housekeeper, spawn_housekeeper},
|
||||
|
@ -47,7 +48,6 @@ use services::{
|
|||
};
|
||||
use smtp::core::SMTP;
|
||||
use store::{
|
||||
fts::Language,
|
||||
parking_lot::Mutex,
|
||||
query::{sort::Pagination, Comparator, Filter, ResultSet, SortedResultSet},
|
||||
roaring::RoaringBitmap,
|
||||
|
|
|
@ -27,9 +27,9 @@ use jmap_proto::{
|
|||
object::{mailbox::QueryArguments, Object},
|
||||
types::{acl::Acl, collection::Collection, property::Property, value::Value},
|
||||
};
|
||||
use nlp::language::Language;
|
||||
use store::{
|
||||
ahash::{AHashMap, AHashSet},
|
||||
fts::Language,
|
||||
query::{self, sort::Pagination},
|
||||
roaring::RoaringBitmap,
|
||||
};
|
||||
|
|
|
@ -28,10 +28,8 @@ use jmap_proto::{
|
|||
},
|
||||
types::{collection::Collection, property::Property},
|
||||
};
|
||||
use store::{
|
||||
fts::Language,
|
||||
query::{self},
|
||||
};
|
||||
use nlp::language::Language;
|
||||
use store::query::{self};
|
||||
|
||||
use crate::JMAP;
|
||||
|
||||
|
|
19
crates/nlp/Cargo.toml
Normal file
19
crates/nlp/Cargo.toml
Normal file
|
@ -0,0 +1,19 @@
|
|||
[package]
|
||||
name = "nlp"
|
||||
version = "0.3.9"
|
||||
edition = "2021"
|
||||
resolver = "2"
|
||||
|
||||
[dependencies]
|
||||
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
|
||||
farmhash = "1.1.5"
|
||||
siphasher = "1.0"
|
||||
serde = { version = "1.0", features = ["derive"]}
|
||||
bincode = "1.3.3"
|
||||
nohash = "0.2.0"
|
||||
ahash = "0.8.3"
|
||||
lazy_static = "1.4"
|
||||
whatlang = "0.16" # Language detection
|
||||
rust-stemmers = "1.2" # Stemmers
|
||||
tinysegmenter = "0.1" # Japanese tokenizer
|
||||
jieba-rs = "0.6" # Chinese stemmer
|
77
crates/nlp/src/bayes/bloom.rs
Normal file
77
crates/nlp/src/bayes/bloom.rs
Normal file
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use nohash::IsEnabled;
|
||||
|
||||
use crate::transformers::osb::{Gram, OsbToken};
|
||||
|
||||
use super::TokenHash;
|
||||
|
||||
pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
|
||||
buf: Vec<u8>,
|
||||
tokens: T,
|
||||
}
|
||||
|
||||
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
|
||||
pub fn new(tokens: T) -> Self {
|
||||
Self {
|
||||
buf: Vec::with_capacity(64),
|
||||
tokens,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
|
||||
type Item = OsbToken<TokenHash>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.tokens.next().map(|token| {
|
||||
let bytes = match token.inner {
|
||||
Gram::Uni { t1 } => t1.as_bytes(),
|
||||
Gram::Bi { t1, t2, .. } => {
|
||||
self.buf.clear();
|
||||
self.buf.extend_from_slice(t1.as_bytes());
|
||||
self.buf.push(b' ');
|
||||
self.buf.extend_from_slice(t2.as_bytes());
|
||||
&self.buf
|
||||
}
|
||||
};
|
||||
|
||||
OsbToken {
|
||||
inner: TokenHash {
|
||||
h1: xxhash_rust::xxh3::xxh3_64(bytes),
|
||||
h2: farmhash::hash64(bytes),
|
||||
},
|
||||
idx: token.idx,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::hash::Hash for TokenHash {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
state.write_u64(self.h1 ^ self.h2);
|
||||
}
|
||||
}
|
||||
|
||||
impl IsEnabled for TokenHash {}
|
167
crates/nlp/src/bayes/classify.rs
Normal file
167
crates/nlp/src/bayes/classify.rs
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use crate::transformers::osb::OsbToken;
|
||||
|
||||
use super::{BayesClassifier, Weights};
|
||||
|
||||
// Position 0 represents Unigram weights
|
||||
const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
|
||||
|
||||
impl BayesClassifier {
|
||||
pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
|
||||
where
|
||||
T: Iterator<Item = OsbToken<Weights>>,
|
||||
{
|
||||
if self.min_learns > 0 && (spam_learns < self.min_learns || ham_learns < self.min_learns) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut processed_tokens = 0;
|
||||
let mut total_spam_prob = 0.0;
|
||||
let mut total_ham_prob = 0.0;
|
||||
|
||||
for token in tokens {
|
||||
let weights = token.inner;
|
||||
let total_count = weights.spam + weights.ham;
|
||||
|
||||
if total_count >= self.min_token_hits {
|
||||
let total_count = total_count as f64;
|
||||
let spam_freq = weights.spam as f64 / f64::max(1.0, spam_learns as f64);
|
||||
let ham_freq = weights.ham as f64 / f64::max(1.0, ham_learns as f64);
|
||||
let spam_prob = spam_freq / (spam_freq + ham_freq);
|
||||
let ham_prob = ham_freq / (spam_freq + ham_freq);
|
||||
|
||||
let fw = FEATURE_WEIGHT[token.idx];
|
||||
let w = (fw * total_count) / (1.0 + fw * total_count);
|
||||
let bayes_spam_prob = prob_combine(spam_prob, total_count, w, 0.5);
|
||||
|
||||
if !((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + self.min_prob_strength)
|
||||
|| (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - self.min_prob_strength))
|
||||
{
|
||||
let bayes_ham_prob = prob_combine(ham_prob, total_count, w, 0.5);
|
||||
total_spam_prob += bayes_spam_prob.ln();
|
||||
total_ham_prob += bayes_ham_prob.ln();
|
||||
processed_tokens += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if processed_tokens == 0
|
||||
|| self.min_tokens > 0 && processed_tokens < (self.min_tokens as f64 * 0.1) as u32
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let (h, s) = if total_spam_prob > -300.0 && total_ham_prob > -300.0 {
|
||||
/* Fisher value is low enough to apply inv_chi_square */
|
||||
(
|
||||
1.0 - inv_chi_square(total_spam_prob, processed_tokens),
|
||||
1.0 - inv_chi_square(total_ham_prob, processed_tokens),
|
||||
)
|
||||
} else {
|
||||
/* Use naive method */
|
||||
if total_spam_prob < total_ham_prob {
|
||||
let h = (1.0 - (total_spam_prob - total_ham_prob).exp())
|
||||
/ (1.0 + (total_spam_prob - total_ham_prob).exp());
|
||||
(h, 1.0 - h)
|
||||
} else {
|
||||
let s = (1.0 - (total_ham_prob - total_spam_prob).exp())
|
||||
/ (1.0 + (total_ham_prob - total_spam_prob).exp());
|
||||
(1.0 - s, s)
|
||||
}
|
||||
};
|
||||
|
||||
let final_prob = if h.is_finite() && s.is_finite() {
|
||||
(s + 1.0 - h) / 2.0
|
||||
} else {
|
||||
/*
|
||||
* We have some overflow, hence we need to check which class
|
||||
* is NaN
|
||||
*/
|
||||
|
||||
if h.is_finite() {
|
||||
1.0
|
||||
} else if s.is_finite() {
|
||||
0.0
|
||||
} else {
|
||||
0.5
|
||||
}
|
||||
};
|
||||
|
||||
if processed_tokens > 0 && (final_prob - 0.5).abs() > 0.05 {
|
||||
Some(final_prob)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns probability of chisquare > value with specified number of freedom
|
||||
* degrees
|
||||
*/
|
||||
#[inline(always)]
|
||||
fn inv_chi_square(value: f64, freedom_deg: u32) -> f64 {
|
||||
let mut prob = value.exp();
|
||||
|
||||
if prob.is_finite() {
|
||||
/*
|
||||
* m is our confidence in class
|
||||
* prob is e ^ x (small value since x is normally less than zero
|
||||
* So we integrate over degrees of freedom and produce the total result
|
||||
* from 1.0 (no confidence) to 0.0 (full confidence)
|
||||
*/
|
||||
|
||||
let mut sum = prob;
|
||||
let m = -value;
|
||||
|
||||
for i in 1..freedom_deg {
|
||||
prob *= m / i as f64;
|
||||
sum += prob;
|
||||
}
|
||||
|
||||
f64::min(1.0, sum)
|
||||
} else {
|
||||
/*
|
||||
* e^x where x is large *NEGATIVE* number is OK, so we have a very strong
|
||||
* confidence that inv-chi-square is close to zero
|
||||
*/
|
||||
|
||||
if value < 0.0 {
|
||||
0.0
|
||||
} else {
|
||||
1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*#[inline(always)]
|
||||
fn normalize_probability(x: f64, bias: f64) -> f64 {
|
||||
((x - bias) * 2.0).powi(8)
|
||||
}*/
|
||||
|
||||
#[inline(always)]
|
||||
fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
|
||||
((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))
|
||||
}
|
75
crates/nlp/src/bayes/mod.rs
Normal file
75
crates/nlp/src/bayes/mod.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::{collections::HashMap, hash::BuildHasherDefault};
|
||||
|
||||
use nohash::NoHashHasher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod bloom;
|
||||
pub mod classify;
|
||||
pub mod train;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
pub struct BayesModel {
|
||||
pub weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>>,
|
||||
pub spam_learns: u32,
|
||||
pub ham_learns: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct BayesClassifier {
|
||||
pub min_token_hits: u32,
|
||||
pub min_tokens: u32,
|
||||
pub min_prob_strength: f64,
|
||||
pub min_learns: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
|
||||
pub struct TokenHash {
|
||||
h1: u64,
|
||||
h2: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
|
||||
pub struct Weights {
|
||||
spam: u32,
|
||||
ham: u32,
|
||||
}
|
||||
|
||||
impl BayesClassifier {
|
||||
pub fn new() -> Self {
|
||||
BayesClassifier {
|
||||
min_token_hits: 2,
|
||||
min_tokens: 11,
|
||||
min_prob_strength: 0.05,
|
||||
min_learns: 200,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BayesClassifier {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
68
crates/nlp/src/bayes/train.rs
Normal file
68
crates/nlp/src/bayes/train.rs
Normal file
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use crate::transformers::osb::OsbToken;
|
||||
|
||||
use super::{BayesModel, TokenHash};
|
||||
|
||||
impl BayesModel {
|
||||
pub fn train<T>(&mut self, tokens: T, is_spam: bool)
|
||||
where
|
||||
T: IntoIterator<Item = OsbToken<TokenHash>>,
|
||||
{
|
||||
if is_spam {
|
||||
self.spam_learns += 1;
|
||||
} else {
|
||||
self.ham_learns += 1;
|
||||
}
|
||||
|
||||
for token in tokens {
|
||||
let hs = self.weights.entry(token.inner).or_default();
|
||||
if is_spam {
|
||||
hs.spam += 1;
|
||||
} else {
|
||||
hs.ham += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn untrain<T>(&mut self, tokens: T, is_spam: bool)
|
||||
where
|
||||
T: IntoIterator<Item = OsbToken<TokenHash>>,
|
||||
{
|
||||
if is_spam {
|
||||
self.spam_learns -= 1;
|
||||
} else {
|
||||
self.ham_learns -= 1;
|
||||
}
|
||||
|
||||
for token in tokens {
|
||||
let hs = self.weights.entry(token.inner).or_default();
|
||||
if is_spam {
|
||||
hs.spam -= 1;
|
||||
} else {
|
||||
hs.ham -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
202
crates/nlp/src/language/mod.rs
Normal file
202
crates/nlp/src/language/mod.rs
Normal file
|
@ -0,0 +1,202 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use crate::tokenizers::{
|
||||
chinese::ChineseTokenizer, japanese::JapaneseTokenizer, word::WordTokenizer, Token,
|
||||
};
|
||||
|
||||
use self::detect::LanguageDetector;
|
||||
|
||||
pub mod detect;
|
||||
pub mod stemmer;
|
||||
|
||||
pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
|
||||
|
||||
impl Language {
|
||||
pub fn tokenize_text<'x>(
|
||||
&self,
|
||||
text: &'x str,
|
||||
max_token_length: usize,
|
||||
) -> LanguageTokenizer<'x> {
|
||||
match self {
|
||||
Language::Japanese => Box::new(
|
||||
JapaneseTokenizer::new(WordTokenizer::new(text, usize::MAX))
|
||||
.filter(move |t| t.word.len() <= max_token_length),
|
||||
),
|
||||
Language::Mandarin => Box::new(
|
||||
ChineseTokenizer::new(WordTokenizer::new(text, usize::MAX))
|
||||
.filter(move |t| t.word.len() <= max_token_length),
|
||||
),
|
||||
_ => Box::new(WordTokenizer::new(text, max_token_length)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum Language {
|
||||
Esperanto = 0,
|
||||
English = 1,
|
||||
Russian = 2,
|
||||
Mandarin = 3,
|
||||
Spanish = 4,
|
||||
Portuguese = 5,
|
||||
Italian = 6,
|
||||
Bengali = 7,
|
||||
French = 8,
|
||||
German = 9,
|
||||
Ukrainian = 10,
|
||||
Georgian = 11,
|
||||
Arabic = 12,
|
||||
Hindi = 13,
|
||||
Japanese = 14,
|
||||
Hebrew = 15,
|
||||
Yiddish = 16,
|
||||
Polish = 17,
|
||||
Amharic = 18,
|
||||
Javanese = 19,
|
||||
Korean = 20,
|
||||
Bokmal = 21,
|
||||
Danish = 22,
|
||||
Swedish = 23,
|
||||
Finnish = 24,
|
||||
Turkish = 25,
|
||||
Dutch = 26,
|
||||
Hungarian = 27,
|
||||
Czech = 28,
|
||||
Greek = 29,
|
||||
Bulgarian = 30,
|
||||
Belarusian = 31,
|
||||
Marathi = 32,
|
||||
Kannada = 33,
|
||||
Romanian = 34,
|
||||
Slovene = 35,
|
||||
Croatian = 36,
|
||||
Serbian = 37,
|
||||
Macedonian = 38,
|
||||
Lithuanian = 39,
|
||||
Latvian = 40,
|
||||
Estonian = 41,
|
||||
Tamil = 42,
|
||||
Vietnamese = 43,
|
||||
Urdu = 44,
|
||||
Thai = 45,
|
||||
Gujarati = 46,
|
||||
Uzbek = 47,
|
||||
Punjabi = 48,
|
||||
Azerbaijani = 49,
|
||||
Indonesian = 50,
|
||||
Telugu = 51,
|
||||
Persian = 52,
|
||||
Malayalam = 53,
|
||||
Oriya = 54,
|
||||
Burmese = 55,
|
||||
Nepali = 56,
|
||||
Sinhalese = 57,
|
||||
Khmer = 58,
|
||||
Turkmen = 59,
|
||||
Akan = 60,
|
||||
Zulu = 61,
|
||||
Shona = 62,
|
||||
Afrikaans = 63,
|
||||
Latin = 64,
|
||||
Slovak = 65,
|
||||
Catalan = 66,
|
||||
Tagalog = 67,
|
||||
Armenian = 68,
|
||||
Unknown = 69,
|
||||
None = 70,
|
||||
}
|
||||
|
||||
impl Language {
|
||||
pub fn from_iso_639(code: &str) -> Option<Self> {
|
||||
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
|
||||
"en" => Language::English,
|
||||
"es" => Language::Spanish,
|
||||
"pt" => Language::Portuguese,
|
||||
"it" => Language::Italian,
|
||||
"fr" => Language::French,
|
||||
"de" => Language::German,
|
||||
"ru" => Language::Russian,
|
||||
"zh" => Language::Mandarin,
|
||||
"ja" => Language::Japanese,
|
||||
"ar" => Language::Arabic,
|
||||
"hi" => Language::Hindi,
|
||||
"ko" => Language::Korean,
|
||||
"bn" => Language::Bengali,
|
||||
"he" => Language::Hebrew,
|
||||
"ur" => Language::Urdu,
|
||||
"fa" => Language::Persian,
|
||||
"ml" => Language::Malayalam,
|
||||
"or" => Language::Oriya,
|
||||
"my" => Language::Burmese,
|
||||
"ne" => Language::Nepali,
|
||||
"si" => Language::Sinhalese,
|
||||
"km" => Language::Khmer,
|
||||
"tk" => Language::Turkmen,
|
||||
"am" => Language::Amharic,
|
||||
"az" => Language::Azerbaijani,
|
||||
"id" => Language::Indonesian,
|
||||
"te" => Language::Telugu,
|
||||
"ta" => Language::Tamil,
|
||||
"vi" => Language::Vietnamese,
|
||||
"gu" => Language::Gujarati,
|
||||
"pa" => Language::Punjabi,
|
||||
"uz" => Language::Uzbek,
|
||||
"hy" => Language::Armenian,
|
||||
"ka" => Language::Georgian,
|
||||
"la" => Language::Latin,
|
||||
"sl" => Language::Slovene,
|
||||
"hr" => Language::Croatian,
|
||||
"sr" => Language::Serbian,
|
||||
"mk" => Language::Macedonian,
|
||||
"lt" => Language::Lithuanian,
|
||||
"lv" => Language::Latvian,
|
||||
"et" => Language::Estonian,
|
||||
"tl" => Language::Tagalog,
|
||||
"af" => Language::Afrikaans,
|
||||
"zu" => Language::Zulu,
|
||||
"sn" => Language::Shona,
|
||||
"ak" => Language::Akan,
|
||||
_ => return None,
|
||||
}
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl Language {
|
||||
pub fn detect(text: String, default: Language) -> (String, Language) {
|
||||
if let Some((l, t)) = text
|
||||
.split_once(':')
|
||||
.and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
|
||||
{
|
||||
(t.to_string(), l)
|
||||
} else {
|
||||
let l = LanguageDetector::detect_single(&text)
|
||||
.and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
|
||||
.unwrap_or(default);
|
||||
(text, l)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -25,25 +25,25 @@ use std::borrow::Cow;
|
|||
|
||||
use rust_stemmers::Algorithm;
|
||||
|
||||
use super::{tokenizers::Tokenizer, Language};
|
||||
use super::{Language, LanguageTokenizer};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct StemmedToken<'x> {
|
||||
pub word: Cow<'x, str>,
|
||||
pub stemmed_word: Option<Cow<'x, str>>,
|
||||
pub offset: u32, // Word offset in the text part
|
||||
pub len: u8, // Word length
|
||||
pub from: usize, // Word offset in the text part
|
||||
pub to: usize, // Word length
|
||||
}
|
||||
|
||||
pub struct Stemmer<'x> {
|
||||
stemmer: Option<rust_stemmers::Stemmer>,
|
||||
tokenizer: Tokenizer<'x>,
|
||||
tokenizer: LanguageTokenizer<'x>,
|
||||
}
|
||||
|
||||
impl<'x> Stemmer<'x> {
|
||||
pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
|
||||
Stemmer {
|
||||
tokenizer: Tokenizer::new(text, language, max_token_length),
|
||||
tokenizer: language.tokenize_text(text, max_token_length),
|
||||
stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
|
||||
}
|
||||
}
|
||||
|
@ -57,15 +57,15 @@ impl<'x> Iterator for Stemmer<'x> {
|
|||
Some(StemmedToken {
|
||||
stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
|
||||
match stemmer.stem(&token.word) {
|
||||
Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
|
||||
Cow::Owned(text) if text.len() != token.word.len() || text != token.word => {
|
||||
Some(text.into())
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}),
|
||||
word: token.word,
|
||||
offset: token.offset,
|
||||
len: token.len,
|
||||
from: token.from,
|
||||
to: token.to,
|
||||
})
|
||||
}
|
||||
}
|
78
crates/nlp/src/lib.rs
Normal file
78
crates/nlp/src/lib.rs
Normal file
|
@ -0,0 +1,78 @@
|
|||
use ahash::AHashSet;
|
||||
|
||||
pub mod bayes;
|
||||
pub mod language;
|
||||
pub mod tokenizers;
|
||||
pub mod transformers;
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PublicSuffix {
|
||||
pub suffixes: AHashSet<String>,
|
||||
pub exceptions: AHashSet<String>,
|
||||
pub wildcards: Vec<String>,
|
||||
}
|
||||
|
||||
impl PublicSuffix {
|
||||
pub fn contains(&self, suffix: &str) -> bool {
|
||||
self.suffixes.contains(suffix)
|
||||
|| (!self.exceptions.contains(suffix)
|
||||
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::fs;
|
||||
|
||||
use crate::{
|
||||
bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
|
||||
transformers::osb::{OsbToken, OsbTokenizer},
|
||||
};
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn train() {
|
||||
let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
|
||||
let mut bayes = BayesModel::default();
|
||||
|
||||
for line in db.lines() {
|
||||
let (text, is_spam) = line.rsplit_once(',').unwrap();
|
||||
let is_spam = is_spam == "1";
|
||||
|
||||
bayes.train(
|
||||
BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
|
||||
is_spam,
|
||||
);
|
||||
}
|
||||
println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
|
||||
fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn classify() {
|
||||
let model: BayesModel =
|
||||
bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
|
||||
let bayes = BayesClassifier::new();
|
||||
|
||||
for text in [
|
||||
"i am attaching to this email a presentation to integrate the spreadsheet into our server",
|
||||
"buy this great product special offer sales",
|
||||
"i m using simple dns from jhsoft we support only a few web sites and i d like to swap secondary services with someone in a similar position",
|
||||
"viagra xenical vioxx zyban propecia we only offer the real viagra xenical ",
|
||||
] {
|
||||
println!(
|
||||
"{:?} -> {}",
|
||||
text,
|
||||
bayes
|
||||
.classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
|
||||
OsbToken {
|
||||
idx: x.idx,
|
||||
inner: *w,
|
||||
}
|
||||
})), model.ham_learns, model.spam_learns)
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
222
crates/nlp/src/tokenizers/chinese.rs
Normal file
222
crates/nlp/src/tokenizers/chinese.rs
Normal file
|
@ -0,0 +1,222 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::{borrow::Cow, vec::IntoIter};
|
||||
|
||||
use jieba_rs::Jieba;
|
||||
|
||||
use super::{InnerToken, Token};
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
lazy_static! {
|
||||
static ref JIEBA: Jieba = Jieba::new();
|
||||
}
|
||||
|
||||
pub struct ChineseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
tokenizer: T,
|
||||
tokens: IntoIter<Token<I>>,
|
||||
phantom: std::marker::PhantomData<&'x str>,
|
||||
}
|
||||
|
||||
impl<'x, T, I> ChineseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
pub fn new(tokenizer: T) -> Self {
|
||||
ChineseTokenizer {
|
||||
tokenizer,
|
||||
tokens: Vec::new().into_iter(),
|
||||
phantom: std::marker::PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
type Item = Token<I>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some(token) = self.tokens.next() {
|
||||
return Some(token);
|
||||
} else {
|
||||
let token = self.tokenizer.next()?;
|
||||
if token.word.is_alphabetic_8bit() {
|
||||
let mut token_to = token.from;
|
||||
match token.word.unwrap_alphabetic() {
|
||||
Cow::Borrowed(word) => {
|
||||
self.tokens = JIEBA
|
||||
.cut(word, false)
|
||||
.into_iter()
|
||||
.map(|word| {
|
||||
let token_from = token_to;
|
||||
token_to += word.len();
|
||||
Token {
|
||||
word: I::new_alphabetic(word),
|
||||
from: token_from,
|
||||
to: token_to,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.into_iter();
|
||||
}
|
||||
Cow::Owned(word) => {
|
||||
self.tokens = JIEBA
|
||||
.cut(&word, false)
|
||||
.into_iter()
|
||||
.map(|word| {
|
||||
let token_from = token_to;
|
||||
token_to += word.len();
|
||||
Token {
|
||||
word: I::new_alphabetic(word.to_string()),
|
||||
from: token_from,
|
||||
to: token_to,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.into_iter();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return token.into();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
|
||||
|
||||
#[test]
|
||||
fn chinese_tokenizer() {
|
||||
assert_eq!(
|
||||
ChineseTokenizer::new(WordTokenizer::new(
|
||||
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
|
||||
40
|
||||
),)
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
Token {
|
||||
word: "孫".into(),
|
||||
from: 0,
|
||||
to: 3
|
||||
},
|
||||
Token {
|
||||
word: "子".into(),
|
||||
from: 3,
|
||||
to: 6
|
||||
},
|
||||
Token {
|
||||
word: "曰".into(),
|
||||
from: 6,
|
||||
to: 9
|
||||
},
|
||||
Token {
|
||||
word: "兵".into(),
|
||||
from: 12,
|
||||
to: 15
|
||||
},
|
||||
Token {
|
||||
word: "者".into(),
|
||||
from: 15,
|
||||
to: 18
|
||||
},
|
||||
Token {
|
||||
word: "國".into(),
|
||||
from: 21,
|
||||
to: 24
|
||||
},
|
||||
Token {
|
||||
word: "之".into(),
|
||||
from: 24,
|
||||
to: 27
|
||||
},
|
||||
Token {
|
||||
word: "大事".into(),
|
||||
from: 27,
|
||||
to: 33
|
||||
},
|
||||
Token {
|
||||
word: "死".into(),
|
||||
from: 36,
|
||||
to: 39
|
||||
},
|
||||
Token {
|
||||
word: "生".into(),
|
||||
from: 39,
|
||||
to: 42
|
||||
},
|
||||
Token {
|
||||
word: "之".into(),
|
||||
from: 42,
|
||||
to: 45
|
||||
},
|
||||
Token {
|
||||
word: "地".into(),
|
||||
from: 45,
|
||||
to: 48
|
||||
},
|
||||
Token {
|
||||
word: "存亡".into(),
|
||||
from: 51,
|
||||
to: 57
|
||||
},
|
||||
Token {
|
||||
word: "之".into(),
|
||||
from: 57,
|
||||
to: 60
|
||||
},
|
||||
Token {
|
||||
word: "道".into(),
|
||||
from: 60,
|
||||
to: 63
|
||||
},
|
||||
Token {
|
||||
word: "不可不".into(),
|
||||
from: 66,
|
||||
to: 75
|
||||
},
|
||||
Token {
|
||||
word: "察".into(),
|
||||
from: 75,
|
||||
to: 78
|
||||
},
|
||||
Token {
|
||||
word: "也".into(),
|
||||
from: 78,
|
||||
to: 81
|
||||
}
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
179
crates/nlp/src/tokenizers/japanese.rs
Normal file
179
crates/nlp/src/tokenizers/japanese.rs
Normal file
|
@ -0,0 +1,179 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::vec::IntoIter;
|
||||
|
||||
use super::{InnerToken, Token};
|
||||
|
||||
pub struct JapaneseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
tokenizer: T,
|
||||
tokens: IntoIter<Token<I>>,
|
||||
phantom: std::marker::PhantomData<&'x str>,
|
||||
}
|
||||
|
||||
impl<'x, T, I> JapaneseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
pub fn new(tokenizer: T) -> Self {
|
||||
JapaneseTokenizer {
|
||||
tokenizer,
|
||||
tokens: Vec::new().into_iter(),
|
||||
phantom: std::marker::PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x, T, I> Iterator for JapaneseTokenizer<'x, T, I>
|
||||
where
|
||||
T: Iterator<Item = Token<I>>,
|
||||
I: InnerToken<'x>,
|
||||
{
|
||||
type Item = Token<I>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some(token) = self.tokens.next() {
|
||||
return Some(token);
|
||||
} else {
|
||||
let token = self.tokenizer.next()?;
|
||||
if token.word.is_alphabetic_8bit() {
|
||||
let mut token_to = token.from;
|
||||
self.tokens = tinysegmenter::tokenize(token.word.unwrap_alphabetic().as_ref())
|
||||
.into_iter()
|
||||
.map(|word| {
|
||||
let token_from = token_to;
|
||||
token_to += word.len();
|
||||
Token {
|
||||
word: I::new_alphabetic(word.to_string()),
|
||||
from: token_from,
|
||||
to: token_to,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.into_iter();
|
||||
} else {
|
||||
return token.into();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::tokenizers::{japanese::JapaneseTokenizer, word::WordTokenizer, Token};
|
||||
|
||||
#[test]
|
||||
fn japanese_tokenizer() {
|
||||
assert_eq!(
|
||||
JapaneseTokenizer::new(WordTokenizer::new(
|
||||
"お先に失礼します あなたの名前は何ですか 123 abc-872",
|
||||
40
|
||||
))
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
Token {
|
||||
word: "お先".into(),
|
||||
from: 0,
|
||||
to: 6
|
||||
},
|
||||
Token {
|
||||
word: "に".into(),
|
||||
from: 6,
|
||||
to: 9
|
||||
},
|
||||
Token {
|
||||
word: "失礼".into(),
|
||||
from: 9,
|
||||
to: 15
|
||||
},
|
||||
Token {
|
||||
word: "し".into(),
|
||||
from: 15,
|
||||
to: 18
|
||||
},
|
||||
Token {
|
||||
word: "ます".into(),
|
||||
from: 18,
|
||||
to: 24
|
||||
},
|
||||
Token {
|
||||
word: "あなた".into(),
|
||||
from: 25,
|
||||
to: 34
|
||||
},
|
||||
Token {
|
||||
word: "の".into(),
|
||||
from: 34,
|
||||
to: 37
|
||||
},
|
||||
Token {
|
||||
word: "名前".into(),
|
||||
from: 37,
|
||||
to: 43
|
||||
},
|
||||
Token {
|
||||
word: "は".into(),
|
||||
from: 43,
|
||||
to: 46
|
||||
},
|
||||
Token {
|
||||
word: "何".into(),
|
||||
from: 46,
|
||||
to: 49
|
||||
},
|
||||
Token {
|
||||
word: "です".into(),
|
||||
from: 49,
|
||||
to: 55
|
||||
},
|
||||
Token {
|
||||
word: "か".into(),
|
||||
from: 55,
|
||||
to: 58
|
||||
},
|
||||
Token {
|
||||
word: "123".into(),
|
||||
from: 59,
|
||||
to: 62
|
||||
},
|
||||
Token {
|
||||
word: "abc".into(),
|
||||
from: 63,
|
||||
to: 66
|
||||
},
|
||||
Token {
|
||||
word: "872".into(),
|
||||
from: 67,
|
||||
to: 70
|
||||
}
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
74
crates/nlp/src/tokenizers/mod.rs
Normal file
74
crates/nlp/src/tokenizers/mod.rs
Normal file
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
pub mod chinese;
|
||||
pub mod japanese;
|
||||
pub mod space;
|
||||
pub mod types;
|
||||
pub mod word;
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Token<T> {
|
||||
pub word: T,
|
||||
pub from: usize,
|
||||
pub to: usize,
|
||||
}
|
||||
|
||||
pub trait InnerToken<'x>: Sized {
|
||||
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self;
|
||||
fn unwrap_alphabetic(self) -> Cow<'x, str>;
|
||||
fn is_alphabetic(&self) -> bool;
|
||||
fn is_alphabetic_8bit(&self) -> bool;
|
||||
}
|
||||
|
||||
impl<'x> InnerToken<'x> for Cow<'x, str> {
|
||||
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self {
|
||||
value.into()
|
||||
}
|
||||
|
||||
fn is_alphabetic(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_alphabetic_8bit(&self) -> bool {
|
||||
!self.chars().all(|c| c.is_ascii())
|
||||
}
|
||||
|
||||
fn unwrap_alphabetic(self) -> Cow<'x, str> {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Token<T> {
|
||||
pub fn new(offset: usize, len: usize, word: T) -> Token<T> {
|
||||
debug_assert!(offset <= u32::max_value() as usize);
|
||||
debug_assert!(len <= u8::max_value() as usize);
|
||||
Token {
|
||||
from: offset,
|
||||
to: offset + len,
|
||||
word,
|
||||
}
|
||||
}
|
||||
}
|
2878
crates/nlp/src/tokenizers/types.rs
Normal file
2878
crates/nlp/src/tokenizers/types.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -21,19 +21,19 @@
|
|||
* for more details.
|
||||
*/
|
||||
|
||||
use std::str::CharIndices;
|
||||
use std::{borrow::Cow, str::CharIndices};
|
||||
|
||||
use super::Token;
|
||||
|
||||
pub struct IndoEuropeanTokenizer<'x> {
|
||||
pub struct WordTokenizer<'x> {
|
||||
max_token_length: usize,
|
||||
text: &'x str,
|
||||
iterator: CharIndices<'x>,
|
||||
}
|
||||
|
||||
impl<'x> IndoEuropeanTokenizer<'x> {
|
||||
pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
|
||||
IndoEuropeanTokenizer {
|
||||
impl<'x> WordTokenizer<'x> {
|
||||
pub fn new(text: &str, max_token_length: usize) -> WordTokenizer {
|
||||
WordTokenizer {
|
||||
max_token_length,
|
||||
text,
|
||||
iterator: text.char_indices(),
|
||||
|
@ -42,8 +42,8 @@ impl<'x> IndoEuropeanTokenizer<'x> {
|
|||
}
|
||||
|
||||
/// Parses indo-european text into lowercase tokens.
|
||||
impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
|
||||
type Item = Token<'x>;
|
||||
impl<'x> Iterator for WordTokenizer<'x> {
|
||||
type Item = Token<Cow<'x, str>>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
while let Some((token_start, ch)) = self.iterator.next() {
|
||||
|
@ -159,7 +159,7 @@ mod tests {
|
|||
];
|
||||
|
||||
for (input, tokens) in inputs.iter() {
|
||||
for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
|
||||
for (pos, token) in WordTokenizer::new(input, 40).enumerate() {
|
||||
assert_eq!(token, tokens[pos]);
|
||||
}
|
||||
}
|
|
@ -21,41 +21,4 @@
|
|||
* for more details.
|
||||
*/
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use super::bloom::{BloomFilter, BloomHashGroup};
|
||||
|
||||
pub trait ToNgrams: Sized {
|
||||
fn new(items: usize) -> Self;
|
||||
fn insert(&mut self, item: &str);
|
||||
fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
|
||||
let mut filter = Self::new(tokens.len().saturating_sub(1));
|
||||
for words in tokens.windows(n) {
|
||||
filter.insert(&words.join(" "));
|
||||
}
|
||||
filter
|
||||
}
|
||||
}
|
||||
|
||||
impl ToNgrams for BloomFilter {
|
||||
fn new(items: usize) -> Self {
|
||||
BloomFilter::new(items)
|
||||
}
|
||||
|
||||
fn insert(&mut self, item: &str) {
|
||||
self.insert(&item.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl ToNgrams for Vec<BloomHashGroup> {
|
||||
fn new(items: usize) -> Self {
|
||||
Vec::with_capacity(items)
|
||||
}
|
||||
|
||||
fn insert(&mut self, item: &str) {
|
||||
self.push(BloomHashGroup {
|
||||
h1: item.into(),
|
||||
h2: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
pub mod osb;
|
467
crates/nlp/src/transformers/osb.rs
Normal file
467
crates/nlp/src/transformers/osb.rs
Normal file
|
@ -0,0 +1,467 @@
|
|||
/*
|
||||
* Copyright (c) 2023 Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of the Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::iter::Peekable;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct OsbToken<T> {
|
||||
pub inner: T,
|
||||
pub idx: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum Gram<'x> {
|
||||
Uni { t1: &'x str },
|
||||
Bi { t1: &'x str, t2: &'x str },
|
||||
}
|
||||
|
||||
pub struct OsbTokenizer<'x, I>
|
||||
where
|
||||
I: Iterator<Item = &'x str>,
|
||||
{
|
||||
iter: Peekable<I>,
|
||||
buf: Vec<Option<&'x str>>,
|
||||
window_size: usize,
|
||||
window_pos: usize,
|
||||
window_idx: usize,
|
||||
}
|
||||
|
||||
impl<'x, I> OsbTokenizer<'x, I>
|
||||
where
|
||||
I: Iterator<Item = &'x str>,
|
||||
{
|
||||
pub fn new(iter: I, window_size: usize) -> Self {
|
||||
Self {
|
||||
iter: iter.peekable(),
|
||||
buf: vec![None; window_size],
|
||||
window_pos: 0,
|
||||
window_idx: 0,
|
||||
window_size,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x, I> Iterator for OsbTokenizer<'x, I>
|
||||
where
|
||||
I: Iterator<Item = &'x str>,
|
||||
{
|
||||
type Item = OsbToken<Gram<'x>>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let end_pos = (self.window_pos + self.window_idx) % self.window_size;
|
||||
if self.buf[end_pos].is_none() {
|
||||
self.buf[end_pos] = self.iter.next();
|
||||
}
|
||||
|
||||
let t1 = self.buf[self.window_pos % self.window_size]?;
|
||||
let token = OsbToken {
|
||||
inner: if self.window_idx != 0 {
|
||||
Gram::Bi {
|
||||
t1,
|
||||
t2: self.buf[end_pos]?,
|
||||
}
|
||||
} else {
|
||||
Gram::Uni { t1 }
|
||||
},
|
||||
idx: self.window_idx,
|
||||
};
|
||||
|
||||
// Increment window
|
||||
self.window_idx += 1;
|
||||
if self.window_idx == self.window_size
|
||||
|| (self.iter.peek().is_none()
|
||||
&& self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
|
||||
{
|
||||
self.buf[self.window_pos % self.window_size] = None;
|
||||
self.window_idx = 0;
|
||||
self.window_pos += 1;
|
||||
}
|
||||
|
||||
Some(token)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use crate::transformers::osb::{Gram, OsbToken};

    #[test]
    fn osb_tokenizer() {
        // Expected output encoded as (head, partner) pairs: a `None` partner
        // marks a unigram and restarts the window offset counter at zero.
        let pairs: &[(&str, Option<&str>)] = &[
            ("The", None),
            ("The", Some("quick")),
            ("The", Some("brown")),
            ("The", Some("fox")),
            ("The", Some("jumps")),
            ("quick", None),
            ("quick", Some("brown")),
            ("quick", Some("fox")),
            ("quick", Some("jumps")),
            ("quick", Some("over")),
            ("brown", None),
            ("brown", Some("fox")),
            ("brown", Some("jumps")),
            ("brown", Some("over")),
            ("brown", Some("the")),
            ("fox", None),
            ("fox", Some("jumps")),
            ("fox", Some("over")),
            ("fox", Some("the")),
            ("fox", Some("lazy")),
            ("jumps", None),
            ("jumps", Some("over")),
            ("jumps", Some("the")),
            ("jumps", Some("lazy")),
            ("jumps", Some("dog")),
            ("over", None),
            ("over", Some("the")),
            ("over", Some("lazy")),
            ("over", Some("dog")),
            ("over", Some("and")),
            ("the", None),
            ("the", Some("lazy")),
            ("the", Some("dog")),
            ("the", Some("and")),
            ("the", Some("the")),
            ("lazy", None),
            ("lazy", Some("dog")),
            ("lazy", Some("and")),
            ("lazy", Some("the")),
            ("lazy", Some("lazy")),
            ("dog", None),
            ("dog", Some("and")),
            ("dog", Some("the")),
            ("dog", Some("lazy")),
            ("dog", Some("cat")),
            ("and", None),
            ("and", Some("the")),
            ("and", Some("lazy")),
            ("and", Some("cat")),
            ("the", None),
            ("the", Some("lazy")),
            ("the", Some("cat")),
            ("lazy", None),
            ("lazy", Some("cat")),
            ("cat", None),
        ];

        let mut expected = Vec::with_capacity(pairs.len());
        let mut idx = 0;
        for &(t1, t2) in pairs {
            if t2.is_none() {
                idx = 0;
            }
            expected.push(OsbToken {
                inner: match t2 {
                    Some(t2) => Gram::Bi { t1, t2 },
                    None => Gram::Uni { t1 },
                },
                idx,
            });
            idx += 1;
        }

        assert_eq!(
            super::OsbTokenizer::new(
                "The quick brown fox jumps over the lazy dog and the lazy cat"
                    .split_ascii_whitespace(),
                5
            )
            .collect::<Vec<_>>(),
            expected
        );
    }
}
|
|
@ -6,6 +6,7 @@ resolver = "2"
|
|||
|
||||
[dependencies]
|
||||
utils = { path = "../utils" }
|
||||
nlp = { path = "../nlp" }
|
||||
maybe-async = { path = "../maybe-async" }
|
||||
rocksdb = { version = "0.20.1", optional = true }
|
||||
foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true }
|
||||
|
@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]}
|
|||
ahash = { version = "0.8.0", features = ["serde"] }
|
||||
bitpacking = "0.8.4"
|
||||
lazy_static = "1.4"
|
||||
whatlang = "0.16" # Language detection
|
||||
rust-stemmers = "1.2" # Stemmers
|
||||
tinysegmenter = "0.1" # Japanese tokenizer
|
||||
jieba-rs = "0.6" # Chinese tokenizer
|
||||
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
|
||||
farmhash = "1.1.5"
|
||||
siphasher = "0.3"
|
||||
siphasher = "1.0"
|
||||
parking_lot = "0.12.1"
|
||||
lru-cache = { version = "0.1.2", optional = true }
|
||||
num_cpus = { version = "1.15.0", optional = true }
|
||||
|
|
|
@ -27,13 +27,12 @@ use std::{
|
|||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
|
||||
use roaring::RoaringBitmap;
|
||||
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
|
||||
|
||||
use crate::{Deserialize, Error, Serialize};
|
||||
|
||||
use super::{stemmer::StemmedToken, tokenizers::Token};
|
||||
|
||||
pub struct BloomFilter {
|
||||
m: u64,
|
||||
b: RoaringBitmap,
|
||||
|
@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
|
|||
}
|
||||
}
|
||||
|
||||
impl From<Token<'_>> for BloomHashGroup {
|
||||
fn from(t: Token<'_>) -> Self {
|
||||
impl From<Token<Cow<'_, str>>> for BloomHashGroup {
|
||||
fn from(t: Token<Cow<'_, str>>) -> Self {
|
||||
Self {
|
||||
h1: BloomHash::hash(t.word.as_ref()),
|
||||
h2: None,
|
||||
|
|
|
@ -24,6 +24,14 @@
|
|||
use std::{borrow::Cow, collections::HashSet};
|
||||
|
||||
use ahash::AHashSet;
|
||||
use nlp::{
|
||||
language::{
|
||||
detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
|
||||
stemmer::Stemmer,
|
||||
Language,
|
||||
},
|
||||
tokenizers::{space::SpaceTokenizer, Token},
|
||||
};
|
||||
use utils::map::vec_map::VecMap;
|
||||
|
||||
use crate::{
|
||||
|
@ -32,13 +40,7 @@ use crate::{
|
|||
Serialize, HASH_EXACT, HASH_STEMMED,
|
||||
};
|
||||
|
||||
use super::{
|
||||
lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
|
||||
stemmer::Stemmer,
|
||||
term_index::{TermIndexBuilder, TokenIndex},
|
||||
tokenizers::{space::SpaceTokenizer, Token},
|
||||
Language,
|
||||
};
|
||||
use super::term_index::{TermIndexBuilder, TokenIndex};
|
||||
|
||||
pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
|
||||
pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
|
||||
|
@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
|
|||
ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
|
||||
terms.push(term_index.add_token(Token {
|
||||
word: token.into(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
from: 0,
|
||||
to: 0,
|
||||
}));
|
||||
}
|
||||
term_index.add_terms(field, 0, terms);
|
||||
|
|
|
@ -26,149 +26,13 @@ use crate::{
|
|||
BitmapKey, Serialize, BM_HASH,
|
||||
};
|
||||
|
||||
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
|
||||
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
|
||||
|
||||
pub mod lang;
|
||||
//pub mod pdf;
|
||||
pub mod bloom;
|
||||
pub mod builder;
|
||||
pub mod ngram;
|
||||
pub mod query;
|
||||
pub mod search_snippet;
|
||||
pub mod stemmer;
|
||||
pub mod term_index;
|
||||
pub mod tokenizers;
|
||||
|
||||
/// Languages supported by the full-text indexing pipeline. Discriminants are
/// explicit and stable because they are serialized/persisted.
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
pub enum Language {
    Esperanto = 0,
    English = 1,
    Russian = 2,
    Mandarin = 3,
    Spanish = 4,
    Portuguese = 5,
    Italian = 6,
    Bengali = 7,
    French = 8,
    German = 9,
    Ukrainian = 10,
    Georgian = 11,
    Arabic = 12,
    Hindi = 13,
    Japanese = 14,
    Hebrew = 15,
    Yiddish = 16,
    Polish = 17,
    Amharic = 18,
    Javanese = 19,
    Korean = 20,
    Bokmal = 21,
    Danish = 22,
    Swedish = 23,
    Finnish = 24,
    Turkish = 25,
    Dutch = 26,
    Hungarian = 27,
    Czech = 28,
    Greek = 29,
    Bulgarian = 30,
    Belarusian = 31,
    Marathi = 32,
    Kannada = 33,
    Romanian = 34,
    Slovene = 35,
    Croatian = 36,
    Serbian = 37,
    Macedonian = 38,
    Lithuanian = 39,
    Latvian = 40,
    Estonian = 41,
    Tamil = 42,
    Vietnamese = 43,
    Urdu = 44,
    Thai = 45,
    Gujarati = 46,
    Uzbek = 47,
    Punjabi = 48,
    Azerbaijani = 49,
    Indonesian = 50,
    Telugu = 51,
    Persian = 52,
    Malayalam = 53,
    Oriya = 54,
    Burmese = 55,
    Nepali = 56,
    Sinhalese = 57,
    Khmer = 58,
    Turkmen = 59,
    Akan = 60,
    Zulu = 61,
    Shona = 62,
    Afrikaans = 63,
    Latin = 64,
    Slovak = 65,
    Catalan = 66,
    Tagalog = 67,
    Armenian = 68,
    // Detection ran but no supported language matched.
    Unknown = 69,
    // No language information available / not applicable.
    None = 70,
}
|
||||
|
||||
impl Language {
    /// Maps an ISO-639-1 language code, optionally carrying a region subtag
    /// (e.g. "en-US"), to a `Language`. Returns `None` for codes that are
    /// not mapped here — note this is a subset of the enum's variants.
    pub fn from_iso_639(code: &str) -> Option<Self> {
        // Strip any "-REGION" suffix before matching the primary subtag.
        match code.split_once('-').map(|c| c.0).unwrap_or(code) {
            "en" => Language::English,
            "es" => Language::Spanish,
            "pt" => Language::Portuguese,
            "it" => Language::Italian,
            "fr" => Language::French,
            "de" => Language::German,
            "ru" => Language::Russian,
            "zh" => Language::Mandarin,
            "ja" => Language::Japanese,
            "ar" => Language::Arabic,
            "hi" => Language::Hindi,
            "ko" => Language::Korean,
            "bn" => Language::Bengali,
            "he" => Language::Hebrew,
            "ur" => Language::Urdu,
            "fa" => Language::Persian,
            "ml" => Language::Malayalam,
            "or" => Language::Oriya,
            "my" => Language::Burmese,
            "ne" => Language::Nepali,
            "si" => Language::Sinhalese,
            "km" => Language::Khmer,
            "tk" => Language::Turkmen,
            "am" => Language::Amharic,
            "az" => Language::Azerbaijani,
            "id" => Language::Indonesian,
            "te" => Language::Telugu,
            "ta" => Language::Tamil,
            "vi" => Language::Vietnamese,
            "gu" => Language::Gujarati,
            "pa" => Language::Punjabi,
            "uz" => Language::Uzbek,
            "hy" => Language::Armenian,
            "ka" => Language::Georgian,
            "la" => Language::Latin,
            "sl" => Language::Slovene,
            "hr" => Language::Croatian,
            "sr" => Language::Serbian,
            "mk" => Language::Macedonian,
            "lt" => Language::Lithuanian,
            "lv" => Language::Latvian,
            "et" => Language::Estonian,
            "tl" => Language::Tagalog,
            "af" => Language::Afrikaans,
            "zu" => Language::Zulu,
            "sn" => Language::Shona,
            "ak" => Language::Akan,
            _ => return None,
        }
        .into()
    }
}
|
||||
|
||||
impl BitmapKey<Vec<u8>> {
|
||||
pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
|
||||
|
@ -209,19 +73,3 @@ impl Operation {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Language {
    /// Determines the language of `text`, returning the text (with any
    /// language prefix stripped) together with the chosen language.
    ///
    /// A `code:rest` prefix whose code is a valid ISO-639 identifier forces
    /// that language; otherwise detection is delegated to
    /// `LanguageDetector::detect_single`, falling back to `default` when the
    /// confidence score is 0.3 or lower.
    pub fn detect(text: String, default: Language) -> (String, Language) {
        if let Some((l, t)) = text
            .split_once(':')
            .and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
        {
            // Explicit override: drop the "code:" prefix from the text.
            (t.to_string(), l)
        } else {
            let l = LanguageDetector::detect_single(&text)
                .and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
                .unwrap_or(default);
            (text, l)
        }
    }
}
|
||||
|
|
|
@ -21,14 +21,14 @@
|
|||
* for more details.
|
||||
*/
|
||||
|
||||
use nlp::language::{stemmer::Stemmer, Language};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
|
||||
BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
|
||||
fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
|
||||
};
|
||||
|
||||
use super::{term_index::TermIndex, Language};
|
||||
use super::term_index::TermIndex;
|
||||
|
||||
impl ReadTransaction<'_> {
|
||||
#[maybe_async::maybe_async]
|
||||
|
@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
|
|||
if match_phrase {
|
||||
let mut phrase = Vec::new();
|
||||
let mut bit_keys = Vec::new();
|
||||
for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
|
||||
for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
|
||||
let key = BitmapKey::hash(
|
||||
token.word.as_ref(),
|
||||
account_id,
|
||||
|
|
|
@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use nlp::language::Language;
|
||||
|
||||
use crate::{
|
||||
fts::{
|
||||
term_index::{TermIndex, TermIndexBuilder},
|
||||
tokenizers::Tokenizer,
|
||||
Language,
|
||||
},
|
||||
fts::term_index::{TermIndex, TermIndexBuilder},
|
||||
Deserialize, Serialize,
|
||||
};
|
||||
|
||||
|
@ -242,7 +240,7 @@ mod tests {
|
|||
|
||||
for (field_num, part) in parts.iter().enumerate() {
|
||||
let mut terms = Vec::new();
|
||||
for token in Tokenizer::new(part, Language::English, 40) {
|
||||
for token in Language::English.tokenize_text(part, 40) {
|
||||
terms.push(builder.add_token(token));
|
||||
}
|
||||
builder.add_terms(field_num as u8, 0, terms);
|
||||
|
|
|
@ -21,14 +21,13 @@
|
|||
* for more details.
|
||||
*/
|
||||
|
||||
use std::convert::TryInto;
|
||||
use std::{borrow::Cow, convert::TryInto};
|
||||
|
||||
use crate::{Deserialize, Serialize};
|
||||
|
||||
use super::{stemmer::StemmedToken, tokenizers::Token};
|
||||
|
||||
use ahash::{AHashMap, AHashSet};
|
||||
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
|
||||
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
|
||||
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
|
||||
|
||||
#[derive(Debug)]
|
||||
|
@ -227,7 +226,7 @@ impl TermIndexBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn add_token(&mut self, token: Token) -> Term {
|
||||
pub fn add_token(&mut self, token: Token<Cow<str>>) -> Term {
|
||||
let id = self.terms.len() as u32;
|
||||
let id = self
|
||||
.terms
|
||||
|
@ -236,8 +235,8 @@ impl TermIndexBuilder {
|
|||
Term {
|
||||
id: *id,
|
||||
id_stemmed: *id,
|
||||
offset: token.offset,
|
||||
len: token.len,
|
||||
offset: token.from as u32,
|
||||
len: (token.to - token.from) as u8,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -259,8 +258,8 @@ impl TermIndexBuilder {
|
|||
Term {
|
||||
id,
|
||||
id_stemmed,
|
||||
offset: token.offset,
|
||||
len: token.len,
|
||||
offset: token.from as u32,
|
||||
len: (token.to - token.from) as u8,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -775,13 +774,10 @@ impl TokenIndex {
|
|||
mod tests {
|
||||
|
||||
use ahash::AHashMap;
|
||||
use nlp::language::{stemmer::Stemmer, Language};
|
||||
|
||||
use crate::{
|
||||
fts::{
|
||||
stemmer::Stemmer,
|
||||
term_index::{TermIndexBuilder, TokenIndex},
|
||||
Language,
|
||||
},
|
||||
fts::term_index::{TermIndexBuilder, TokenIndex},
|
||||
Deserialize, Serialize,
|
||||
};
|
||||
|
||||
|
|
|
@ -1,197 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::{borrow::Cow, vec::IntoIter};
|
||||
|
||||
use jieba_rs::Jieba;
|
||||
|
||||
use super::{word::WordTokenizer, Token};
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
// The Jieba dictionary is expensive to build, so a single shared instance
// is initialized lazily and reused for all tokenizers.
lazy_static! {
    static ref JIEBA: Jieba = Jieba::new();
}

/// Tokenizer for Chinese text: the input is first split into words by
/// `WordTokenizer`, then each non-ASCII word is segmented with Jieba.
pub struct ChineseTokenizer<'x> {
    word_tokenizer: WordTokenizer<'x>,
    // Jieba segments of the current non-ASCII word, drained one at a time.
    tokens: IntoIter<&'x str>,
    // Byte offset of the word currently being segmented.
    token_offset: usize,
    // Byte length of the word currently being segmented.
    token_len: usize,
    // Bytes of the current word consumed so far.
    token_len_cur: usize,
    // Tokens longer than this (in bytes) are skipped.
    max_token_length: usize,
}

impl<'x> ChineseTokenizer<'x> {
    /// Creates a tokenizer over `text`, discarding tokens longer than
    /// `max_token_length` bytes.
    pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer {
        ChineseTokenizer {
            word_tokenizer: WordTokenizer::new(text),
            tokens: Vec::new().into_iter(),
            max_token_length,
            token_offset: 0,
            token_len: 0,
            token_len_cur: 0,
        }
    }
}
|
||||
|
||||
impl<'x> Iterator for ChineseTokenizer<'x> {
    type Item = Token<'x>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Some(ch_token) = self.tokens.next() {
                // Emit the next Jieba segment of the current word, tracking
                // its byte offset within the original text.
                let offset_start = self.token_offset + self.token_len_cur;
                self.token_len_cur += ch_token.len();

                if ch_token.len() <= self.max_token_length {
                    return Token::new(offset_start, ch_token.len(), ch_token.into()).into();
                }
            } else {
                // Current word exhausted: pull the next word from the
                // underlying word tokenizer.
                loop {
                    let (token, is_ascii) = self.word_tokenizer.next()?;
                    if !is_ascii {
                        // Non-ASCII word: segment it with Jieba. The word
                        // tokenizer only yields borrowed slices, so the
                        // Owned arm is unreachable.
                        let word = match token.word {
                            Cow::Borrowed(word) => word,
                            Cow::Owned(_) => unreachable!(),
                        };
                        self.tokens = JIEBA.cut(word, false).into_iter();
                        self.token_offset = token.offset as usize;
                        self.token_len = token.len as usize;
                        self.token_len_cur = 0;
                        break;
                    } else if token.len as usize <= self.max_token_length {
                        // ASCII words pass through without segmentation.
                        return token.into();
                    }
                }
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chinese_tokenizer() {
        // Expected segmentation as (word, byte offset, byte length).
        let expected: &[(&str, u32, u8)] = &[
            ("孫", 0, 3),
            ("子", 3, 3),
            ("曰", 6, 3),
            ("兵", 12, 3),
            ("者", 15, 3),
            ("國", 21, 3),
            ("之", 24, 3),
            ("大事", 27, 6),
            ("死", 36, 3),
            ("生", 39, 3),
            ("之", 42, 3),
            ("地", 45, 3),
            ("存亡", 51, 6),
            ("之", 57, 3),
            ("道", 60, 3),
            ("不可不", 66, 9),
            ("察", 75, 3),
            ("也", 78, 3),
        ];

        assert_eq!(
            ChineseTokenizer::new(
                "孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
                40
            )
            .collect::<Vec<_>>(),
            expected
                .iter()
                .map(|&(word, offset, len)| Token {
                    word: word.into(),
                    offset,
                    len
                })
                .collect::<Vec<_>>()
        );
    }
}
|
|
@ -1,168 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::vec::IntoIter;
|
||||
|
||||
use super::{word::WordTokenizer, Token};
|
||||
|
||||
/// Tokenizer for Japanese text: the input is first split into words by
/// `WordTokenizer`, then each non-ASCII word is segmented with TinySegmenter.
pub struct JapaneseTokenizer<'x> {
    word_tokenizer: WordTokenizer<'x>,
    // TinySegmenter segments of the current word, drained one at a time.
    tokens: IntoIter<String>,
    // Byte offset of the word currently being segmented.
    token_offset: usize,
    // Byte length of the word currently being segmented.
    token_len: usize,
    // Bytes of the current word consumed so far.
    token_len_cur: usize,
    // Tokens longer than this (in bytes) are skipped.
    max_token_length: usize,
}

impl<'x> JapaneseTokenizer<'x> {
    /// Creates a tokenizer over `text`, discarding tokens longer than
    /// `max_token_length` bytes.
    pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer {
        JapaneseTokenizer {
            word_tokenizer: WordTokenizer::new(text),
            tokens: Vec::new().into_iter(),
            max_token_length,
            token_offset: 0,
            token_len: 0,
            token_len_cur: 0,
        }
    }
}
|
||||
|
||||
impl<'x> Iterator for JapaneseTokenizer<'x> {
    type Item = Token<'x>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Some(jp_token) = self.tokens.next() {
                // Emit the next TinySegmenter segment of the current word,
                // tracking its byte offset within the original text.
                let offset_start = self.token_offset + self.token_len_cur;
                self.token_len_cur += jp_token.len();

                if jp_token.len() <= self.max_token_length {
                    return Token::new(offset_start, jp_token.len(), jp_token.into()).into();
                }
            } else {
                // Current word exhausted: pull the next word from the
                // underlying word tokenizer.
                loop {
                    let (token, is_ascii) = self.word_tokenizer.next()?;
                    if !is_ascii {
                        // Non-ASCII word: segment it with TinySegmenter.
                        self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter();
                        self.token_offset = token.offset as usize;
                        self.token_len = token.len as usize;
                        self.token_len_cur = 0;
                        break;
                    } else if token.len as usize <= self.max_token_length {
                        // ASCII words pass through without segmentation.
                        return token.into();
                    }
                }
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn japanese_tokenizer() {
        // Expected segmentation as (word, byte offset, byte length).
        let expected: &[(&str, u32, u8)] = &[
            ("お先", 0, 6),
            ("に", 6, 3),
            ("失礼", 9, 6),
            ("し", 15, 3),
            ("ます", 18, 6),
            ("あなた", 25, 9),
            ("の", 34, 3),
            ("名前", 37, 6),
            ("は", 43, 3),
            ("何", 46, 3),
            ("です", 49, 6),
            ("か", 55, 3),
            ("123", 59, 3),
            ("abc", 63, 3),
            ("872", 67, 3),
        ];

        assert_eq!(
            JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40)
                .collect::<Vec<_>>(),
            expected
                .iter()
                .map(|&(word, offset, len)| Token {
                    word: word.into(),
                    offset,
                    len
                })
                .collect::<Vec<_>>()
        );
    }
}
|
|
@ -1,96 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
pub mod chinese;
|
||||
pub mod indo_european;
|
||||
pub mod japanese;
|
||||
pub mod space;
|
||||
pub mod word;
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use self::{
|
||||
chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer,
|
||||
};
|
||||
|
||||
use super::Language;
|
||||
|
||||
/// A single token produced by a tokenizer, with its position in the
/// original text.
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'x> {
    pub word: Cow<'x, str>,
    pub offset: u32, // Word offset in the text part
    pub len: u8,     // Word length
}

impl<'x> Token<'x> {
    /// Creates a token; `offset` must fit in a `u32` and `len` in a `u8`
    /// (checked in debug builds only, truncated by the casts otherwise).
    pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> {
        debug_assert!(offset <= u32::max_value() as usize);
        debug_assert!(len <= u8::max_value() as usize);
        Token {
            offset: offset as u32,
            len: len as u8,
            word,
        }
    }
}
|
||||
|
||||
// The language-specific tokenizer selected at construction time.
enum LanguageTokenizer<'x> {
    IndoEuropean(IndoEuropeanTokenizer<'x>),
    Japanese(JapaneseTokenizer<'x>),
    Chinese(ChineseTokenizer<'x>),
}

/// Language-aware tokenizer facade: dispatches to the appropriate
/// tokenizer implementation for the given `Language`.
pub struct Tokenizer<'x> {
    tokenizer: LanguageTokenizer<'x>,
}

impl<'x> Tokenizer<'x> {
    /// Creates a tokenizer for `text`. Japanese and Mandarin use dedicated
    /// segmenters; every other language falls back to the Indo-European
    /// tokenizer.
    pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self {
        Tokenizer {
            tokenizer: match language {
                Language::Japanese => {
                    LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length))
                }
                Language::Mandarin => {
                    LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length))
                }
                _ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new(
                    text,
                    max_token_length,
                )),
            },
        }
    }
}

impl<'x> Iterator for Tokenizer<'x> {
    type Item = Token<'x>;

    // Forwards to the selected language-specific tokenizer.
    fn next(&mut self) -> Option<Self::Item> {
        match &mut self.tokenizer {
            LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(),
            LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(),
            LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(),
        }
    }
}
|
|
@ -1,80 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Stalwart Labs Ltd.
|
||||
*
|
||||
* This file is part of Stalwart Mail Server.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as
|
||||
* published by the Free Software Foundation, either version 3 of
|
||||
* the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
* in the LICENSE file at the top-level directory of this distribution.
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* You can be released from the requirements of the AGPLv3 license by
|
||||
* purchasing a commercial license. Please contact licensing@stalw.art
|
||||
* for more details.
|
||||
*/
|
||||
|
||||
use std::str::CharIndices;
|
||||
|
||||
use super::Token;
|
||||
|
||||
/// Splits text into alphanumeric word tokens, reporting for each token
/// whether it is pure ASCII so CJK-aware tokenizers can decide whether to
/// segment it further.
pub struct WordTokenizer<'x> {
    text: &'x str,
    iterator: CharIndices<'x>,
}

impl<'x> WordTokenizer<'x> {
    /// Creates a word tokenizer over `text`.
    pub fn new(text: &str) -> WordTokenizer {
        WordTokenizer {
            text,
            iterator: text.char_indices(),
        }
    }
}

/// Parses text into tokens, used by non-IndoEuropean tokenizers.
impl<'x> Iterator for WordTokenizer<'x> {
    // The boolean flags whether the token consists only of ASCII characters.
    type Item = (Token<'x>, bool);

    fn next(&mut self) -> Option<Self::Item> {
        let mut is_ascii = true;
        // Skip non-alphanumeric characters until a token starts.
        while let Some((token_start, ch)) = self.iterator.next() {
            if ch.is_alphanumeric() {
                // Scan forward to the first non-alphanumeric character (or
                // end of text), recording whether any character is non-ASCII.
                let token_end = (&mut self.iterator)
                    .filter_map(|(pos, ch)| {
                        if ch.is_alphanumeric() {
                            if is_ascii && !ch.is_ascii() {
                                is_ascii = false;
                            }
                            None
                        } else {
                            pos.into()
                        }
                    })
                    .next()
                    .unwrap_or(self.text.len());

                let token_len = token_end - token_start;
                if token_end > token_start {
                    return (
                        Token::new(
                            token_start,
                            token_len,
                            self.text[token_start..token_end].into(),
                        ),
                        is_ascii,
                    )
                        .into();
                }
            }
        }
        None
    }
}
|
|
@ -24,12 +24,10 @@
|
|||
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
|
||||
|
||||
use ahash::HashSet;
|
||||
use nlp::tokenizers::space::SpaceTokenizer;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
|
||||
BitmapKey, ReadTransaction, Store,
|
||||
};
|
||||
use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store};
|
||||
|
||||
use super::{Filter, ResultSet, TextMatch};
|
||||
|
||||
|
|
|
@ -26,11 +26,10 @@ pub mod get;
|
|||
pub mod log;
|
||||
pub mod sort;
|
||||
|
||||
use nlp::language::Language;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{
|
||||
fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS,
|
||||
};
|
||||
use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS};
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum Operator {
|
||||
|
|
|
@ -23,11 +23,11 @@
|
|||
|
||||
use std::{collections::HashSet, slice::Iter, time::SystemTime};
|
||||
|
||||
use nlp::tokenizers::space::SpaceTokenizer;
|
||||
use utils::codec::leb128::{Leb128Iterator, Leb128Vec};
|
||||
|
||||
use crate::{
|
||||
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
|
||||
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
|
||||
fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
|
||||
};
|
||||
|
||||
use self::assert::AssertValue;
|
||||
|
|
|
@ -12,6 +12,7 @@ foundationdb = ["store/foundation"]
|
|||
|
||||
[dependencies]
|
||||
store = { path = "../crates/store", features = ["test_mode"] }
|
||||
nlp = { path = "../crates/nlp" }
|
||||
directory = { path = "../crates/directory" }
|
||||
jmap = { path = "../crates/jmap", features = ["test_mode"] }
|
||||
jmap_proto = { path = "../crates/jmap-proto" }
|
||||
|
|
|
@ -27,10 +27,11 @@ use std::{
|
|||
};
|
||||
|
||||
use jmap_proto::types::keyword::Keyword;
|
||||
use nlp::language::Language;
|
||||
use store::{ahash::AHashMap, query::sort::Pagination};
|
||||
|
||||
use store::{
|
||||
fts::{builder::FtsIndexBuilder, Language},
|
||||
fts::builder::FtsIndexBuilder,
|
||||
query::{Comparator, Filter},
|
||||
write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE},
|
||||
Store, ValueKey,
|
||||
|
|
Loading…
Reference in a new issue