Bayes classifier, type tokenizer and NLP module reorganization

mdecimus 2023-10-10 18:58:38 +02:00
parent a0812095ef
commit 3d9efd363a
53 changed files with 4651 additions and 944 deletions

View file

@ -5,7 +5,8 @@ All notable changes to this project will be documented in this file. This projec
## [0.3.9] - 2023-10-07
### Added
- Support for reading environment variables from configuration file using the `!ENV_VAR_NAME` special keyword.
- Support for reading environment variables from the configuration file using the `!ENV_VAR_NAME` special keyword.
- Option to disable ANSI color codes in logs.
### Changed
- Querying directories from a Sieve script is now done using the `query()` method from `eval`. Your scripts will need to be updated, please refer to the [new syntax](https://stalw.art/docs/smtp/filter/sieve#directory-queries).

116
Cargo.lock generated
View file

@ -169,13 +169,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "antispam"
version = "0.1.0"
dependencies = [
"fancy-regex",
]
[[package]]
name = "anyhow"
version = "1.0.75"
@ -1487,25 +1480,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.4"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480"
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "etcetera"
version = "0.8.0"
@ -2252,6 +2234,7 @@ dependencies = [
"mail-parser",
"mail-send",
"md5",
"nlp",
"parking_lot",
"rustls 0.21.7",
"rustls-pemfile",
@ -2450,6 +2433,7 @@ dependencies = [
"mail-parser",
"mail-send",
"mime",
"nlp",
"p256",
"rand 0.8.5",
"rasn",
@ -2510,9 +2494,9 @@ dependencies = [
[[package]]
name = "jobserver"
version = "0.1.26"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d"
dependencies = [
"libc",
]
@ -2703,9 +2687,9 @@ dependencies = [
[[package]]
name = "linux-raw-sys"
version = "0.4.8"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db"
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "lock_api"
@ -2754,7 +2738,7 @@ dependencies = [
"mail-parser",
"parking_lot",
"quick-xml 0.30.0",
"ring 0.17.2",
"ring 0.17.3",
"rustls-pemfile",
"serde",
"serde_json",
@ -3001,6 +2985,30 @@ dependencies = [
"pin-utils",
]
[[package]]
name = "nlp"
version = "0.3.9"
dependencies = [
"ahash 0.8.3",
"bincode",
"farmhash",
"jieba-rs",
"lazy_static",
"nohash",
"rust-stemmers",
"serde",
"siphasher 1.0.0",
"tinysegmenter",
"whatlang",
"xxhash-rust",
]
[[package]]
name = "nohash"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca"
[[package]]
name = "nom"
version = "7.1.3"
@ -3072,9 +3080,9 @@ dependencies = [
[[package]]
name = "num-traits"
version = "0.2.16"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
"libm",
@ -3476,7 +3484,7 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@ -3485,7 +3493,7 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
"siphasher 0.3.11",
]
[[package]]
@ -3791,9 +3799,9 @@ dependencies = [
[[package]]
name = "rasn"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cf5174961dbfd4f03b57e71e5a11b034f564d5f0b133d63e39d703ac3d2876b"
checksum = "4addd1a49756bcb131c2f686c6c833d2b63e4da7a0df07efd8c3de04b7efbdb2"
dependencies = [
"arrayvec",
"bitvec",
@ -3813,9 +3821,9 @@ dependencies = [
[[package]]
name = "rasn-cms"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56517898cf38bb50fdb6479049ed476510bf59ae7d329b35129dc8a8b309697f"
checksum = "e269b4df6eea0f54abd46afacd759b1c13a27e98da98a47ef3c405ef3568b0f5"
dependencies = [
"rasn",
"rasn-pkix",
@ -3823,9 +3831,9 @@ dependencies = [
[[package]]
name = "rasn-derive"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8def4ce07f970be91bad36c3090af419dcd9e696897ada3cf74bd480e0101d61"
checksum = "ba8242a16e3461b81333516ad8457906f52fdf21d087417fb59262c9ab406618"
dependencies = [
"either",
"itertools 0.10.5",
@ -3838,9 +3846,9 @@ dependencies = [
[[package]]
name = "rasn-pkix"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebdeef45b70d4c20ce34725707b2784c761eacaaa4d841eab46f9f9c6dc10dd3"
checksum = "06179c947a63fe9f9f5d73a539dcb13d90c6bdaeb03bd28b90ad796aff9fe6a8"
dependencies = [
"rasn",
]
@ -4024,9 +4032,9 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.2"
version = "0.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "911b295d2d302948838c8ac142da1ee09fa7863163b44e6715bc9357905878b8"
checksum = "9babe80d5c16becf6594aa32ad2be8fe08498e7ae60b77de8df700e67f191d7e"
dependencies = [
"cc",
"getrandom 0.2.10",
@ -4198,9 +4206,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.38.17"
version = "0.38.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7"
checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c"
dependencies = [
"bitflags 2.4.0",
"errno",
@ -4644,6 +4652,12 @@ version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "siphasher"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe"
[[package]]
name = "slab"
version = "0.4.9"
@ -5048,10 +5062,10 @@ dependencies = [
"farmhash",
"foundationdb",
"futures",
"jieba-rs",
"lazy_static",
"lru-cache",
"maybe-async 0.2.7",
"nlp",
"num_cpus",
"parking_lot",
"r2d2",
@ -5061,14 +5075,11 @@ dependencies = [
"rocksdb",
"rusqlite",
"rust-s3",
"rust-stemmers",
"serde",
"siphasher",
"tinysegmenter",
"siphasher 1.0.0",
"tokio",
"tracing",
"utils",
"whatlang",
"xxhash-rust",
]
@ -5244,6 +5255,7 @@ dependencies = [
"mail-parser",
"mail-send",
"managesieve",
"nlp",
"num_cpus",
"rayon",
"reqwest",
@ -5358,9 +5370,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.32.0"
version = "1.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9"
checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653"
dependencies = [
"backtrace",
"bytes",
@ -6040,12 +6052,12 @@ dependencies = [
[[package]]
name = "webpki"
version = "0.22.2"
version = "0.22.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
dependencies = [
"ring 0.16.20",
"untrusted 0.7.1",
"ring 0.17.3",
"untrusted 0.9.0",
]
[[package]]

View file

@ -8,9 +8,9 @@ members = [
"crates/imap-proto",
"crates/smtp",
"crates/managesieve",
"crates/nlp",
"crates/store",
"crates/directory",
"crates/antispam",
"crates/utils",
"crates/maybe-async",
"crates/cli",

View file

@ -38,6 +38,7 @@ Key features:
- OAuth 2.0 [authorization code](https://www.rfc-editor.org/rfc/rfc6749) and [device authorization](https://www.rfc-editor.org/rfc/rfc8628) flows.
- Access Control Lists (ACLs).
- Rate limiting.
- Security audited (read the [report](https://stalw.art/blog/security-audit)).
- **Robust and scalable**:
- **FoundationDB** or **SQLite** database backends.
- **S3-compatible** blob storage support.

View file

@ -1,7 +0,0 @@
[package]
name = "antispam"
version = "0.1.0"
edition = "2021"
[dependencies]
fancy-regex = "0.11.0"

View file

@ -1,64 +0,0 @@
use std::path::PathBuf;
use import::spamassassin::import_spamassassin;
pub mod import;
fn main() {
import_spamassassin(
PathBuf::from("/Users/me/code/mail-server/resources/spamassassin"),
"cf".to_string(),
false,
);
}
const _IGNORE: &str = r#"
[antispam]
required-score = 5
add-headers = ["X-Spam-Checker-Version: SpamAssassin _VERSION_ (_SUBVERSION_) on _HOSTNAME_",
"X-Spam-Flag: _YESNOCAPS_", "X-Spam-Level: _STARS(*)_",
"X-Spam-Status: _YESNO_, score=_SCORE_ required=_REQD_ tests=_TESTS_ autolearn=_AUTOLEARN_ version=_VERSION_"]
originating-ip-headers = ["X-Yahoo-Post-IP", "X-Originating-IP", "X-Apparently-From",
"X-SenderIP X-AOL-IP", "X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp"]
rewrite-headers = ["Subject: [SPAM] _SUBJECT_"]
redirect-patterns = ["""m'/(?:index.php)?\?.*(?<=[?&])URL=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/url\?.*?(?<=[?&])q=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:site|inurl):(.*?)(?:$|%20|[\s+&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/search\?.*?(?<=[?&])q=[^&]*?(?<=%20|..[=+\s])(?:"|%22)(.*?)(?:$|%22|["\s+&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/translate\?.*?(?<=[?&])u=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?google(?:\.\w{2,3}){1,2}/pagead/iclk\?.*?(?<=[?&])adurl=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:\w+\.)?aol\.com/redir\.adp\?.*(?<=[?&])_url=(.*?)(?:$|[&\#])'i""",
"""m'^https?/*(?:\w+\.)?facebook\.com/l/;(.*)'i""",
"""/^http:\/\/chkpt\.zdnet\.com\/chkpt\/\w+\/(.*)$/i""",
"""/^http:\/\/www(?:\d+)?\.nate\.com\/r\/\w+\/(.*)$/i""",
"""/^http:\/\/.+\.gov\/(?:.*\/)?externalLink\.jhtml\?.*url=(.*?)(?:&.*)?$/i""",
"""/^http:\/\/redir\.internet\.com\/.+?\/.+?\/(.*)$/i""",
"""/^http:\/\/(?:.*?\.)?adtech\.de\/.*(?:;|\|)link=(.*?)(?:;|$)/i""",
"""m'^http.*?/redirect\.php\?.*(?<=[?&])goto=(.*?)(?:$|[&\#])'i""",
"""m'^https?:/*(?:[^/]+\.)?emf\d\.com/r\.cfm.*?&r=(.*)'i"""
]
[antispam.autolearn]
enable = true
ignore-headers = [ "X-ACL-Warn", "X-Alimail-AntiSpam", "X-Amavis-Modified", "X-Anti*", "X-aol-global-disposition",
"X-ASF-*", "X-Assp-Version", "X-Authority-Analysis", "X-Authvirus", "X-Auto-Response-Suppress", "X-AV-Do-Run",
"X-AV-Status", "X-avast-antispam", "X-Backend", "X-Barracuda*", "X-Bayes*", "X-BitDefender*", "X-BL", "X-Bogosity",
"X-Boxtrapper", "X-Brightmail-Tracker", "X-BTI-AntiSpam", "X-Bugzilla-Version", "X-CanIt*", "X-Clapf-spamicity",
"X-Cloud-Security", "X-CM-Score", "X-CMAE-*", "X-Company", "X-Coremail-Antispam", "X-CRM114-*", "X-CT-Spam",
"X-CTCH-*", "X-Drweb-SpamState", "X-DSPAM*", "X-eavas*", "X-Enigmail-Version", "X-Eset*", "X-Exchange-Antispam-Report",
"X-ExtloopSabreCommercials1", "X-EYOU-SPAMVALUE", "X-FB-OUTBOUND-SPAM", "X-FEAS-SBL", "X-FILTER-SCORE", "X-Forefront*",
"X-Fuglu*", "X-getmail-filter-classifier", "X-GFIME-MASPAM", "X-Gmane-NNTP-Posting-Host", "X-GMX-Anti*", "X-He-Spam",
"X-hMailServer-Spam", "X-IAS", "X-iGspam-global", "X-Injected-Via-Gmane", "X-Interia-Antivirus", "X-IP-Spam-Verdict",
"X-Ironport*", "X-Junk*", "X-KLMS-*", "X-KMail-*", "X-MailCleaner-*", "X-MailFoundry", "X-MDMailLookup-Result",
"X-ME-*", "X-MessageFilter", "X-Microsoft-Antispam", "X-Mlf-Version", "X-MXScan-*", "X-NAI-Spam-*", "X-NetStation-Status",
"X-OVH-SPAM*", "X-PerlMx-*", "X-PFSI-Info", "X-PMX-*", "X-Policy-Service", "X-policyd-weight", "X-PreRBLs",
"X-Probable-Spam", "X-PROLinux-SpamCheck", "X-Proofpoint-*", "x-purgate-*", "X-Qmail-Scanner-*", "X-Quarantine-ID",
"X-RSpam-Report", "X-SA-*", "X-Scanned-by", "X-SmarterMail-CustomSpamHeader", "X-Spam*", "X-SPF-Scan-By", "X-STA-*",
"X-StarScan-Version", "X-SurGATE-Result", "X-SWITCHham-Score", "X-UI-*", "X-Univie*", "X-Virus*", "X-VR-*",
"X-WatchGuard*", "X-Whitelist-Domain", "X-WUM-CCI", "X_CMAE_Category" ]
threshold.ham = 0.1
threshold.spam = 12.0
"#;

View file

@ -10,6 +10,7 @@ jmap = { path = "../jmap" }
jmap_proto = { path = "../jmap-proto" }
directory = { path = "../directory" }
store = { path = "../store" }
nlp = { path = "../nlp" }
utils = { path = "../utils" }
mail-parser = { git = "https://github.com/stalwartlabs/mail-parser", features = ["full_encoding", "ludicrous_mode"] }
mail-send = { git = "https://github.com/stalwartlabs/mail-send", default-features = false, features = ["cram-md5", "skip-ehlo"] }

View file

@ -34,8 +34,9 @@ use imap_proto::{
use jmap_proto::types::{collection::Collection, id::Id, keyword::Keyword, property::Property};
use mail_parser::HeaderName;
use nlp::language::Language;
use store::{
fts::{builder::MAX_TOKEN_LENGTH, Language},
fts::builder::MAX_TOKEN_LENGTH,
query::{self, log::Query, sort::Pagination, ResultSet},
roaring::RoaringBitmap,
write::now,

View file

@ -6,6 +6,7 @@ resolver = "2"
[dependencies]
store = { path = "../store" }
nlp = { path = "../nlp" }
jmap_proto = { path = "../jmap-proto" }
smtp = { path = "../smtp" }
utils = { path = "../utils" }

View file

@ -23,10 +23,8 @@
use std::{str::FromStr, time::Duration};
use store::{
fts::Language,
rand::{distributions::Alphanumeric, thread_rng, Rng},
};
use nlp::language::Language;
use store::rand::{distributions::Alphanumeric, thread_rng, Rng};
use super::session::BaseCapabilities;

View file

@ -37,11 +37,9 @@ use mail_parser::{
parsers::{fields::thread::thread_name, preview::preview_text},
Addr, Address, GetHeader, Group, HeaderName, HeaderValue, Message, MessagePart, PartType,
};
use nlp::language::Language;
use store::{
fts::{
builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
Language,
},
fts::builder::{FtsIndexBuilder, MAX_TOKEN_LENGTH},
write::{BatchBuilder, IntoOperations, F_BITMAP, F_CLEAR, F_INDEX, F_VALUE},
};

View file

@ -28,8 +28,9 @@ use jmap_proto::{
types::{acl::Acl, collection::Collection, keyword::Keyword, property::Property},
};
use mail_parser::HeaderName;
use nlp::language::Language;
use store::{
fts::{builder::MAX_TOKEN_LENGTH, Language},
fts::builder::MAX_TOKEN_LENGTH,
query::{self},
roaring::RoaringBitmap,
ValueKey,

View file

@ -30,14 +30,12 @@ use jmap_proto::{
types::{acl::Acl, collection::Collection},
};
use mail_parser::{decoders::html::html_to_text, MessageParser, PartType};
use nlp::language::{stemmer::Stemmer, Language};
use store::{
fts::{
builder::MAX_TOKEN_LENGTH,
search_snippet::generate_snippet,
stemmer::Stemmer,
term_index::{self, TermIndex},
tokenizers::Tokenizer,
Language,
},
BlobKind,
};
@ -66,7 +64,8 @@ impl JMAP {
|| (text.starts_with('\'') && text.ends_with('\''))
{
terms.push(
Tokenizer::new(&text, language, MAX_TOKEN_LENGTH)
language
.tokenize_text(&text, MAX_TOKEN_LENGTH)
.map(|token| (token.word.into_owned(), None))
.collect::<Vec<_>>(),
);

View file

@ -40,6 +40,7 @@ use jmap_proto::{
},
types::{collection::Collection, property::Property},
};
use nlp::language::Language;
use services::{
delivery::spawn_delivery_manager,
housekeeper::{self, init_housekeeper, spawn_housekeeper},
@ -47,7 +48,6 @@ use services::{
};
use smtp::core::SMTP;
use store::{
fts::Language,
parking_lot::Mutex,
query::{sort::Pagination, Comparator, Filter, ResultSet, SortedResultSet},
roaring::RoaringBitmap,

View file

@ -27,9 +27,9 @@ use jmap_proto::{
object::{mailbox::QueryArguments, Object},
types::{acl::Acl, collection::Collection, property::Property, value::Value},
};
use nlp::language::Language;
use store::{
ahash::{AHashMap, AHashSet},
fts::Language,
query::{self, sort::Pagination},
roaring::RoaringBitmap,
};

View file

@ -28,10 +28,8 @@ use jmap_proto::{
},
types::{collection::Collection, property::Property},
};
use store::{
fts::Language,
query::{self},
};
use nlp::language::Language;
use store::query::{self};
use crate::JMAP;

19
crates/nlp/Cargo.toml Normal file
View file

@ -0,0 +1,19 @@
[package]
name = "nlp"
version = "0.3.9"
edition = "2021"
resolver = "2"
[dependencies]
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
siphasher = "1.0"
serde = { version = "1.0", features = ["derive"]}
bincode = "1.3.3"
nohash = "0.2.0"
ahash = "0.8.3"
lazy_static = "1.4"
whatlang = "0.16" # Language detection
rust-stemmers = "1.2" # Stemmers
tinysegmenter = "0.1" # Japanese tokenizer
jieba-rs = "0.6" # Chinese stemmer

View file

@ -0,0 +1,77 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use nohash::IsEnabled;
use crate::transformers::osb::{Gram, OsbToken};
use super::TokenHash;
pub struct BloomHasher<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> {
buf: Vec<u8>,
tokens: T,
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> BloomHasher<'x, T> {
pub fn new(tokens: T) -> Self {
Self {
buf: Vec::with_capacity(64),
tokens,
}
}
}
impl<'x, T: Iterator<Item = OsbToken<Gram<'x>>>> Iterator for BloomHasher<'x, T> {
type Item = OsbToken<TokenHash>;
fn next(&mut self) -> Option<Self::Item> {
self.tokens.next().map(|token| {
let bytes = match token.inner {
Gram::Uni { t1 } => t1.as_bytes(),
Gram::Bi { t1, t2, .. } => {
self.buf.clear();
self.buf.extend_from_slice(t1.as_bytes());
self.buf.push(b' ');
self.buf.extend_from_slice(t2.as_bytes());
&self.buf
}
};
OsbToken {
inner: TokenHash {
h1: xxhash_rust::xxh3::xxh3_64(bytes),
h2: farmhash::hash64(bytes),
},
idx: token.idx,
}
})
}
}
impl std::hash::Hash for TokenHash {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
state.write_u64(self.h1 ^ self.h2);
}
}
impl IsEnabled for TokenHash {}
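
A minimal sketch of how the two stages above compose, written against the APIs added in this commit (the driver `main` and sample text are illustrative only, not part of the change):

use nlp::bayes::bloom::BloomHasher;
use nlp::transformers::osb::OsbTokenizer;

fn main() {
    let text = "buy this great product special offer";
    // Window size 5 mirrors the value used in the crate's own tests.
    let grams = OsbTokenizer::new(text.split_ascii_whitespace(), 5);
    // Each Gram (unigram or "t1 t2" bigram) is hashed twice, with xxh3 and
    // farmhash, into the TokenHash consumed by the Bayes model.
    for token in BloomHasher::new(grams) {
        println!("idx {} -> {:?}", token.idx, token.inner);
    }
}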

View file

@ -0,0 +1,167 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use crate::transformers::osb::OsbToken;
use super::{BayesClassifier, Weights};
// Position 0 represents Unigram weights
const FEATURE_WEIGHT: [f64; 8] = [1.0, 3125.0, 256.0, 27.0, 1.0, 0.0, 0.0, 0.0];
impl BayesClassifier {
pub fn classify<T>(&self, tokens: T, ham_learns: u32, spam_learns: u32) -> Option<f64>
where
T: Iterator<Item = OsbToken<Weights>>,
{
if self.min_learns > 0 && (spam_learns < self.min_learns || ham_learns < self.min_learns) {
return None;
}
let mut processed_tokens = 0;
let mut total_spam_prob = 0.0;
let mut total_ham_prob = 0.0;
for token in tokens {
let weights = token.inner;
let total_count = weights.spam + weights.ham;
if total_count >= self.min_token_hits {
let total_count = total_count as f64;
let spam_freq = weights.spam as f64 / f64::max(1.0, spam_learns as f64);
let ham_freq = weights.ham as f64 / f64::max(1.0, ham_learns as f64);
let spam_prob = spam_freq / (spam_freq + ham_freq);
let ham_prob = ham_freq / (spam_freq + ham_freq);
let fw = FEATURE_WEIGHT[token.idx];
let w = (fw * total_count) / (1.0 + fw * total_count);
let bayes_spam_prob = prob_combine(spam_prob, total_count, w, 0.5);
if !((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + self.min_prob_strength)
|| (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - self.min_prob_strength))
{
let bayes_ham_prob = prob_combine(ham_prob, total_count, w, 0.5);
total_spam_prob += bayes_spam_prob.ln();
total_ham_prob += bayes_ham_prob.ln();
processed_tokens += 1;
}
}
}
if processed_tokens == 0
|| self.min_tokens > 0 && processed_tokens < (self.min_tokens as f64 * 0.1) as u32
{
return None;
}
let (h, s) = if total_spam_prob > -300.0 && total_ham_prob > -300.0 {
/* Fisher value is low enough to apply inv_chi_square */
(
1.0 - inv_chi_square(total_spam_prob, processed_tokens),
1.0 - inv_chi_square(total_ham_prob, processed_tokens),
)
} else {
/* Use naive method */
if total_spam_prob < total_ham_prob {
let h = (1.0 - (total_spam_prob - total_ham_prob).exp())
/ (1.0 + (total_spam_prob - total_ham_prob).exp());
(h, 1.0 - h)
} else {
let s = (1.0 - (total_ham_prob - total_spam_prob).exp())
/ (1.0 + (total_ham_prob - total_spam_prob).exp());
(1.0 - s, s)
}
};
let final_prob = if h.is_finite() && s.is_finite() {
(s + 1.0 - h) / 2.0
} else {
/*
* We have some overflow, hence we need to check which class
* is NaN
*/
if h.is_finite() {
1.0
} else if s.is_finite() {
0.0
} else {
0.5
}
};
if processed_tokens > 0 && (final_prob - 0.5).abs() > 0.05 {
Some(final_prob)
} else {
None
}
}
}
/**
* Returns the probability of chi-square > value with the specified number of
* degrees of freedom
*/
#[inline(always)]
fn inv_chi_square(value: f64, freedom_deg: u32) -> f64 {
let mut prob = value.exp();
if prob.is_finite() {
/*
* m is our confidence in class
* prob is e ^ x (small value since x is normally less than zero
* So we integrate over degrees of freedom and produce the total result
* from 1.0 (no confidence) to 0.0 (full confidence)
*/
let mut sum = prob;
let m = -value;
for i in 1..freedom_deg {
prob *= m / i as f64;
sum += prob;
}
f64::min(1.0, sum)
} else {
/*
* e^x where x is large *NEGATIVE* number is OK, so we have a very strong
* confidence that inv-chi-square is close to zero
*/
if value < 0.0 {
0.0
} else {
1.0
}
}
}
/*#[inline(always)]
fn normalize_probability(x: f64, bias: f64) -> f64 {
((x - bias) * 2.0).powi(8)
}*/
#[inline(always)]
fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))
}
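
As a sanity check on the weighting above, a small sketch (illustrative only, not part of the commit) that reproduces the per-token arithmetic for a unigram seen 3 times in spam and once in ham, with 300 learns of each class:

fn main() {
    // Local copy of prob_combine() above, for illustration.
    fn prob_combine(prob: f64, cnt: f64, weight: f64, assumed: f64) -> f64 {
        (weight * assumed + cnt * prob) / (weight + cnt)
    }

    let (spam_hits, ham_hits) = (3.0_f64, 1.0_f64);
    let (spam_learns, ham_learns) = (300.0_f64, 300.0_f64);

    let spam_freq = spam_hits / spam_learns;             // 0.01
    let ham_freq = ham_hits / ham_learns;                // ~0.00333
    let spam_prob = spam_freq / (spam_freq + ham_freq);  // 0.75

    // Unigram => FEATURE_WEIGHT[0] = 1.0; the token was seen 4 times in total.
    let fw = 1.0;
    let total_count = spam_hits + ham_hits;
    let w = (fw * total_count) / (1.0 + fw * total_count); // 0.8

    // (0.8 * 0.5 + 4.0 * 0.75) / (0.8 + 4.0) ~= 0.708, outside the
    // 0.5 +/- min_prob_strength dead zone, so ln(0.708) is accumulated.
    println!("{:.3}", prob_combine(spam_prob, total_count, w, 0.5));
}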

View file

@ -0,0 +1,75 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{collections::HashMap, hash::BuildHasherDefault};
use nohash::NoHashHasher;
use serde::{Deserialize, Serialize};
pub mod bloom;
pub mod classify;
pub mod train;
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct BayesModel {
pub weights: HashMap<TokenHash, Weights, BuildHasherDefault<NoHashHasher<TokenHash>>>,
pub spam_learns: u32,
pub ham_learns: u32,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct BayesClassifier {
pub min_token_hits: u32,
pub min_tokens: u32,
pub min_prob_strength: f64,
pub min_learns: u32,
}
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone, PartialEq, Eq)]
pub struct TokenHash {
h1: u64,
h2: u64,
}
#[derive(Debug, Serialize, Deserialize, Default, Copy, Clone)]
pub struct Weights {
spam: u32,
ham: u32,
}
impl BayesClassifier {
pub fn new() -> Self {
BayesClassifier {
min_token_hits: 2,
min_tokens: 11,
min_prob_strength: 0.05,
min_learns: 200,
}
}
}
impl Default for BayesClassifier {
fn default() -> Self {
Self::new()
}
}

View file

@ -0,0 +1,68 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use crate::transformers::osb::OsbToken;
use super::{BayesModel, TokenHash};
impl BayesModel {
pub fn train<T>(&mut self, tokens: T, is_spam: bool)
where
T: IntoIterator<Item = OsbToken<TokenHash>>,
{
if is_spam {
self.spam_learns += 1;
} else {
self.ham_learns += 1;
}
for token in tokens {
let hs = self.weights.entry(token.inner).or_default();
if is_spam {
hs.spam += 1;
} else {
hs.ham += 1;
}
}
}
pub fn untrain<T>(&mut self, tokens: T, is_spam: bool)
where
T: IntoIterator<Item = OsbToken<TokenHash>>,
{
if is_spam {
self.spam_learns -= 1;
} else {
self.ham_learns -= 1;
}
for token in tokens {
let hs = self.weights.entry(token.inner).or_default();
if is_spam {
hs.spam -= 1;
} else {
hs.ham -= 1;
}
}
}
}
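
A sketch of a train/untrain round trip using the hashing pipeline from `bayes::bloom` (the driver code and sample message are illustrative only):

use nlp::bayes::{bloom::BloomHasher, BayesModel};
use nlp::transformers::osb::OsbTokenizer;

fn main() {
    let mut model = BayesModel::default();
    let msg = "buy this great product special offer";

    // Learn the message as spam: bumps spam_learns and the spam count of
    // every hashed OSB token...
    model.train(
        BloomHasher::new(OsbTokenizer::new(msg.split_ascii_whitespace(), 5)),
        true,
    );
    assert_eq!(model.spam_learns, 1);

    // ...and untrain it again, which walks the same tokens and decrements
    // the counters back.
    model.untrain(
        BloomHasher::new(OsbTokenizer::new(msg.split_ascii_whitespace(), 5)),
        true,
    );
    assert_eq!(model.spam_learns, 0);
}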

View file

@ -0,0 +1,202 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::borrow::Cow;
use crate::tokenizers::{
chinese::ChineseTokenizer, japanese::JapaneseTokenizer, word::WordTokenizer, Token,
};
use self::detect::LanguageDetector;
pub mod detect;
pub mod stemmer;
pub type LanguageTokenizer<'x> = Box<dyn Iterator<Item = Token<Cow<'x, str>>> + 'x>;
impl Language {
pub fn tokenize_text<'x>(
&self,
text: &'x str,
max_token_length: usize,
) -> LanguageTokenizer<'x> {
match self {
Language::Japanese => Box::new(
JapaneseTokenizer::new(WordTokenizer::new(text, usize::MAX))
.filter(move |t| t.word.len() <= max_token_length),
),
Language::Mandarin => Box::new(
ChineseTokenizer::new(WordTokenizer::new(text, usize::MAX))
.filter(move |t| t.word.len() <= max_token_length),
),
_ => Box::new(WordTokenizer::new(text, max_token_length)),
}
}
}
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
pub enum Language {
Esperanto = 0,
English = 1,
Russian = 2,
Mandarin = 3,
Spanish = 4,
Portuguese = 5,
Italian = 6,
Bengali = 7,
French = 8,
German = 9,
Ukrainian = 10,
Georgian = 11,
Arabic = 12,
Hindi = 13,
Japanese = 14,
Hebrew = 15,
Yiddish = 16,
Polish = 17,
Amharic = 18,
Javanese = 19,
Korean = 20,
Bokmal = 21,
Danish = 22,
Swedish = 23,
Finnish = 24,
Turkish = 25,
Dutch = 26,
Hungarian = 27,
Czech = 28,
Greek = 29,
Bulgarian = 30,
Belarusian = 31,
Marathi = 32,
Kannada = 33,
Romanian = 34,
Slovene = 35,
Croatian = 36,
Serbian = 37,
Macedonian = 38,
Lithuanian = 39,
Latvian = 40,
Estonian = 41,
Tamil = 42,
Vietnamese = 43,
Urdu = 44,
Thai = 45,
Gujarati = 46,
Uzbek = 47,
Punjabi = 48,
Azerbaijani = 49,
Indonesian = 50,
Telugu = 51,
Persian = 52,
Malayalam = 53,
Oriya = 54,
Burmese = 55,
Nepali = 56,
Sinhalese = 57,
Khmer = 58,
Turkmen = 59,
Akan = 60,
Zulu = 61,
Shona = 62,
Afrikaans = 63,
Latin = 64,
Slovak = 65,
Catalan = 66,
Tagalog = 67,
Armenian = 68,
Unknown = 69,
None = 70,
}
impl Language {
pub fn from_iso_639(code: &str) -> Option<Self> {
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
_ => return None,
}
.into()
}
}
impl Language {
pub fn detect(text: String, default: Language) -> (String, Language) {
if let Some((l, t)) = text
.split_once(':')
.and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
{
(t.to_string(), l)
} else {
let l = LanguageDetector::detect_single(&text)
.and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
.unwrap_or(default);
(text, l)
}
}
}
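
A sketch of the two code paths in `detect`: an explicit ISO-639 prefix is trusted and stripped, otherwise the detector (whatlang, per the crate's dependencies) must score above 0.3 or the default is kept. The Polish sample and its expected outcome are assumptions, not asserted:

use nlp::language::Language;

fn main() {
    // An explicit ISO-639 prefix wins and is stripped from the text.
    let (text, lang) = Language::detect("en:hello world".to_string(), Language::None);
    assert_eq!((text.as_str(), lang), ("hello world", Language::English));

    // Without a prefix the detector scores the text; the default is kept
    // when confidence is 0.3 or lower.
    let (_, lang) = Language::detect(
        "Szybki brązowy lis przeskakuje nad leniwym psem".to_string(),
        Language::None,
    );
    println!("{lang:?}"); // expected to report Polish, but not asserted here
}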

View file

@ -25,25 +25,25 @@ use std::borrow::Cow;
use rust_stemmers::Algorithm;
use super::{tokenizers::Tokenizer, Language};
use super::{Language, LanguageTokenizer};
#[derive(Debug, PartialEq, Eq)]
pub struct StemmedToken<'x> {
pub word: Cow<'x, str>,
pub stemmed_word: Option<Cow<'x, str>>,
pub offset: u32, // Word offset in the text part
pub len: u8, // Word length
pub from: usize, // Word offset in the text part
pub to: usize, // Word length
}
pub struct Stemmer<'x> {
stemmer: Option<rust_stemmers::Stemmer>,
tokenizer: Tokenizer<'x>,
tokenizer: LanguageTokenizer<'x>,
}
impl<'x> Stemmer<'x> {
pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Stemmer<'x> {
Stemmer {
tokenizer: Tokenizer::new(text, language, max_token_length),
tokenizer: language.tokenize_text(text, max_token_length),
stemmer: STEMMER_MAP[language as usize].map(rust_stemmers::Stemmer::create),
}
}
@ -57,15 +57,15 @@ impl<'x> Iterator for Stemmer<'x> {
Some(StemmedToken {
stemmed_word: self.stemmer.as_ref().and_then(|stemmer| {
match stemmer.stem(&token.word) {
Cow::Owned(text) if text.len() != token.len as usize || text != token.word => {
Cow::Owned(text) if text.len() != token.word.len() || text != token.word => {
Some(text.into())
}
_ => None,
}
}),
word: token.word,
offset: token.offset,
len: token.len,
from: token.from,
to: token.to,
})
}
}
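
A usage sketch for the updated `Stemmer` (illustrative only); `stemmed_word` is `None` whenever the Snowball stem equals the original word:

use nlp::language::{stemmer::Stemmer, Language};

fn main() {
    for token in Stemmer::new("Running dogs were barking loudly", Language::English, 40) {
        // e.g. "running" with stemmed_word Some("run"), spanning bytes 0..7
        println!("{} -> {:?} ({}..{})", token.word, token.stemmed_word, token.from, token.to);
    }
}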

78
crates/nlp/src/lib.rs Normal file
View file

@ -0,0 +1,78 @@
use ahash::AHashSet;
pub mod bayes;
pub mod language;
pub mod tokenizers;
pub mod transformers;
#[derive(Debug, Clone, Default)]
pub struct PublicSuffix {
pub suffixes: AHashSet<String>,
pub exceptions: AHashSet<String>,
pub wildcards: Vec<String>,
}
impl PublicSuffix {
pub fn contains(&self, suffix: &str) -> bool {
self.suffixes.contains(suffix)
|| (!self.exceptions.contains(suffix)
&& self.wildcards.iter().any(|w| suffix.ends_with(w)))
}
}
#[cfg(test)]
mod test {
use std::fs;
use crate::{
bayes::{bloom::BloomHasher, BayesClassifier, BayesModel},
transformers::osb::{OsbToken, OsbTokenizer},
};
#[test]
#[ignore]
fn train() {
let db = fs::read_to_string("spam_or_not_spam.csv").unwrap();
let mut bayes = BayesModel::default();
for line in db.lines() {
let (text, is_spam) = line.rsplit_once(',').unwrap();
let is_spam = is_spam == "1";
bayes.train(
BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)),
is_spam,
);
}
println!("Ham: {} Spam: {}", bayes.ham_learns, bayes.spam_learns,);
fs::write("spam_or_not_spam.bin", bincode::serialize(&bayes).unwrap()).unwrap();
}
#[test]
#[ignore]
fn classify() {
let model: BayesModel =
bincode::deserialize(&fs::read("spam_or_not_spam.bin").unwrap()).unwrap();
let bayes = BayesClassifier::new();
for text in [
"i am attaching to this email a presentation to integrate the spreadsheet into our server",
"buy this great product special offer sales",
"i m using simple dns from jhsoft we support only a few web sites and i d like to swap secondary services with someone in a similar position",
"viagra xenical vioxx zyban propecia we only offer the real viagra xenical ",
] {
println!(
"{:?} -> {}",
text,
bayes
.classify(BloomHasher::new(OsbTokenizer::new(text.split_ascii_whitespace(), 5)).filter_map(|x| model.weights.get(&x.inner).map(|w| {
OsbToken {
idx: x.idx,
inner: *w,
}
})), model.ham_learns, model.spam_learns)
.unwrap()
);
}
}
}
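
The `PublicSuffix` helper declared at the top of this file is plain set logic; a sketch of the intended behaviour once populated (the commit does not show where the suffix lists are loaded, so the entries below are hypothetical):

use nlp::PublicSuffix;

fn main() {
    let mut psl = PublicSuffix::default();
    psl.suffixes.insert("co.uk".to_string());
    psl.wildcards.push("ck".to_string());        // stands in for a "*.ck" rule
    psl.exceptions.insert("www.ck".to_string()); // stands in for a "!www.ck" rule

    assert!(psl.contains("co.uk"));       // exact suffix match
    assert!(psl.contains("anything.ck")); // ends with a wildcard entry
    assert!(!psl.contains("www.ck"));     // exception short-circuits the wildcard
    assert!(!psl.contains("com.au"));     // not in any list
}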

View file

@ -0,0 +1,222 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, vec::IntoIter};
use jieba_rs::Jieba;
use super::{InnerToken, Token};
use lazy_static::lazy_static;
lazy_static! {
static ref JIEBA: Jieba = Jieba::new();
}
pub struct ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
tokenizer: T,
tokens: IntoIter<Token<I>>,
phantom: std::marker::PhantomData<&'x str>,
}
impl<'x, T, I> ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
pub fn new(tokenizer: T) -> Self {
ChineseTokenizer {
tokenizer,
tokens: Vec::new().into_iter(),
phantom: std::marker::PhantomData,
}
}
}
impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
type Item = Token<I>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.tokens.next() {
return Some(token);
} else {
let token = self.tokenizer.next()?;
if token.word.is_alphabetic_8bit() {
let mut token_to = token.from;
match token.word.unwrap_alphabetic() {
Cow::Borrowed(word) => {
self.tokens = JIEBA
.cut(word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
Cow::Owned(word) => {
self.tokens = JIEBA
.cut(&word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word.to_string()),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
}
} else {
return token.into();
}
}
}
}
}
#[cfg(test)]
mod tests {
use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
#[test]
fn chinese_tokenizer() {
assert_eq!(
ChineseTokenizer::new(WordTokenizer::new(
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
40
),)
.collect::<Vec<_>>(),
vec![
Token {
word: "".into(),
from: 0,
to: 3
},
Token {
word: "".into(),
from: 3,
to: 6
},
Token {
word: "".into(),
from: 6,
to: 9
},
Token {
word: "".into(),
from: 12,
to: 15
},
Token {
word: "".into(),
from: 15,
to: 18
},
Token {
word: "".into(),
from: 21,
to: 24
},
Token {
word: "".into(),
from: 24,
to: 27
},
Token {
word: "大事".into(),
from: 27,
to: 33
},
Token {
word: "".into(),
from: 36,
to: 39
},
Token {
word: "".into(),
from: 39,
to: 42
},
Token {
word: "".into(),
from: 42,
to: 45
},
Token {
word: "".into(),
from: 45,
to: 48
},
Token {
word: "存亡".into(),
from: 51,
to: 57
},
Token {
word: "".into(),
from: 57,
to: 60
},
Token {
word: "".into(),
from: 60,
to: 63
},
Token {
word: "不可不".into(),
from: 66,
to: 75
},
Token {
word: "".into(),
from: 75,
to: 78
},
Token {
word: "".into(),
from: 78,
to: 81
}
]
);
}
}

View file

@ -0,0 +1,179 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::vec::IntoIter;
use super::{InnerToken, Token};
pub struct JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
tokenizer: T,
tokens: IntoIter<Token<I>>,
phantom: std::marker::PhantomData<&'x str>,
}
impl<'x, T, I> JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
pub fn new(tokenizer: T) -> Self {
JapaneseTokenizer {
tokenizer,
tokens: Vec::new().into_iter(),
phantom: std::marker::PhantomData,
}
}
}
impl<'x, T, I> Iterator for JapaneseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
type Item = Token<I>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.tokens.next() {
return Some(token);
} else {
let token = self.tokenizer.next()?;
if token.word.is_alphabetic_8bit() {
let mut token_to = token.from;
self.tokens = tinysegmenter::tokenize(token.word.unwrap_alphabetic().as_ref())
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word.to_string()),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
} else {
return token.into();
}
}
}
}
}
#[cfg(test)]
mod tests {
use crate::tokenizers::{japanese::JapaneseTokenizer, word::WordTokenizer, Token};
#[test]
fn japanese_tokenizer() {
assert_eq!(
JapaneseTokenizer::new(WordTokenizer::new(
"お先に失礼します あなたの名前は何ですか 123 abc-872",
40
))
.collect::<Vec<_>>(),
vec![
Token {
word: "お先".into(),
from: 0,
to: 6
},
Token {
word: "".into(),
from: 6,
to: 9
},
Token {
word: "失礼".into(),
from: 9,
to: 15
},
Token {
word: "".into(),
from: 15,
to: 18
},
Token {
word: "ます".into(),
from: 18,
to: 24
},
Token {
word: "あなた".into(),
from: 25,
to: 34
},
Token {
word: "".into(),
from: 34,
to: 37
},
Token {
word: "名前".into(),
from: 37,
to: 43
},
Token {
word: "".into(),
from: 43,
to: 46
},
Token {
word: "".into(),
from: 46,
to: 49
},
Token {
word: "です".into(),
from: 49,
to: 55
},
Token {
word: "".into(),
from: 55,
to: 58
},
Token {
word: "123".into(),
from: 59,
to: 62
},
Token {
word: "abc".into(),
from: 63,
to: 66
},
Token {
word: "872".into(),
from: 67,
to: 70
}
]
);
}
}

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
pub mod chinese;
pub mod japanese;
pub mod space;
pub mod types;
pub mod word;
use std::borrow::Cow;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<T> {
pub word: T,
pub from: usize,
pub to: usize,
}
pub trait InnerToken<'x>: Sized {
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self;
fn unwrap_alphabetic(self) -> Cow<'x, str>;
fn is_alphabetic(&self) -> bool;
fn is_alphabetic_8bit(&self) -> bool;
}
impl<'x> InnerToken<'x> for Cow<'x, str> {
fn new_alphabetic(value: impl Into<Cow<'x, str>>) -> Self {
value.into()
}
fn is_alphabetic(&self) -> bool {
true
}
fn is_alphabetic_8bit(&self) -> bool {
!self.chars().all(|c| c.is_ascii())
}
fn unwrap_alphabetic(self) -> Cow<'x, str> {
self
}
}
impl<T> Token<T> {
pub fn new(offset: usize, len: usize, word: T) -> Token<T> {
debug_assert!(offset <= u32::max_value() as usize);
debug_assert!(len <= u8::max_value() as usize);
Token {
from: offset,
to: offset + len,
word,
}
}
}

File diff suppressed because it is too large.

View file

@ -21,19 +21,19 @@
* for more details.
*/
use std::str::CharIndices;
use std::{borrow::Cow, str::CharIndices};
use super::Token;
pub struct IndoEuropeanTokenizer<'x> {
pub struct WordTokenizer<'x> {
max_token_length: usize,
text: &'x str,
iterator: CharIndices<'x>,
}
impl<'x> IndoEuropeanTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> IndoEuropeanTokenizer {
IndoEuropeanTokenizer {
impl<'x> WordTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> WordTokenizer {
WordTokenizer {
max_token_length,
text,
iterator: text.char_indices(),
@ -42,8 +42,8 @@ impl<'x> IndoEuropeanTokenizer<'x> {
}
/// Parses indo-european text into lowercase tokens.
impl<'x> Iterator for IndoEuropeanTokenizer<'x> {
type Item = Token<'x>;
impl<'x> Iterator for WordTokenizer<'x> {
type Item = Token<Cow<'x, str>>;
fn next(&mut self) -> Option<Self::Item> {
while let Some((token_start, ch)) = self.iterator.next() {
@ -159,7 +159,7 @@ mod tests {
];
for (input, tokens) in inputs.iter() {
for (pos, token) in IndoEuropeanTokenizer::new(input, 40).enumerate() {
for (pos, token) in WordTokenizer::new(input, 40).enumerate() {
assert_eq!(token, tokens[pos]);
}
}
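
A usage sketch for the renamed `WordTokenizer` (illustrative only); per the doc comment above it emits lowercase words, and the new `from`/`to` fields carry byte offsets into the input:

use nlp::tokenizers::word::WordTokenizer;

fn main() {
    // 40 mirrors the max token length used throughout the crate's tests;
    // each token records where the word starts and ends in the source text.
    for token in WordTokenizer::new("Hello, NLP world!", 40) {
        println!("{:?} ({}..{})", token.word, token.from, token.to);
    }
}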

View file

@ -21,41 +21,4 @@
* for more details.
*/
use std::borrow::Cow;
use super::bloom::{BloomFilter, BloomHashGroup};
pub trait ToNgrams: Sized {
fn new(items: usize) -> Self;
fn insert(&mut self, item: &str);
fn to_ngrams(tokens: &[Cow<'_, str>], n: usize) -> Self {
let mut filter = Self::new(tokens.len().saturating_sub(1));
for words in tokens.windows(n) {
filter.insert(&words.join(" "));
}
filter
}
}
impl ToNgrams for BloomFilter {
fn new(items: usize) -> Self {
BloomFilter::new(items)
}
fn insert(&mut self, item: &str) {
self.insert(&item.into())
}
}
impl ToNgrams for Vec<BloomHashGroup> {
fn new(items: usize) -> Self {
Vec::with_capacity(items)
}
fn insert(&mut self, item: &str) {
self.push(BloomHashGroup {
h1: item.into(),
h2: None,
})
}
}
pub mod osb;

View file

@ -0,0 +1,467 @@
/*
* Copyright (c) 2023 Stalwart Labs Ltd.
*
* This file is part of the Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::iter::Peekable;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OsbToken<T> {
pub inner: T,
pub idx: usize,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Gram<'x> {
Uni { t1: &'x str },
Bi { t1: &'x str, t2: &'x str },
}
pub struct OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
iter: Peekable<I>,
buf: Vec<Option<&'x str>>,
window_size: usize,
window_pos: usize,
window_idx: usize,
}
impl<'x, I> OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
pub fn new(iter: I, window_size: usize) -> Self {
Self {
iter: iter.peekable(),
buf: vec![None; window_size],
window_pos: 0,
window_idx: 0,
window_size,
}
}
}
impl<'x, I> Iterator for OsbTokenizer<'x, I>
where
I: Iterator<Item = &'x str>,
{
type Item = OsbToken<Gram<'x>>;
fn next(&mut self) -> Option<Self::Item> {
let end_pos = (self.window_pos + self.window_idx) % self.window_size;
if self.buf[end_pos].is_none() {
self.buf[end_pos] = self.iter.next();
}
let t1 = self.buf[self.window_pos % self.window_size]?;
let token = OsbToken {
inner: if self.window_idx != 0 {
Gram::Bi {
t1,
t2: self.buf[end_pos]?,
}
} else {
Gram::Uni { t1 }
},
idx: self.window_idx,
};
// Increment window
self.window_idx += 1;
if self.window_idx == self.window_size
|| (self.iter.peek().is_none()
&& self.buf[(self.window_pos + self.window_idx) % self.window_size].is_none())
{
self.buf[self.window_pos % self.window_size] = None;
self.window_idx = 0;
self.window_pos += 1;
}
Some(token)
}
}
#[cfg(test)]
mod test {
use crate::transformers::osb::{Gram, OsbToken};
#[test]
fn osb_tokenizer() {
assert_eq!(
super::OsbTokenizer::new(
"The quick brown fox jumps over the lazy dog and the lazy cat"
.split_ascii_whitespace(),
5
)
.collect::<Vec<_>>(),
vec![
OsbToken {
inner: Gram::Uni { t1: "The" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "quick"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "brown"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "fox"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "The",
t2: "jumps"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "quick" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "brown"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "fox"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "jumps"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "quick",
t2: "over"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "brown" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "fox"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "jumps"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "over"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "brown",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "fox" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "jumps"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "over"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "fox",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "jumps" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "over"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "jumps",
t2: "dog"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "over" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "dog"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "over",
t2: "and"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "dog"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "and"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "the"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "dog"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "and"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "the"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "lazy"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "dog" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "and"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "the"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "lazy"
},
idx: 3
},
OsbToken {
inner: Gram::Bi {
t1: "dog",
t2: "cat"
},
idx: 4
},
OsbToken {
inner: Gram::Uni { t1: "and" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "the"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "lazy"
},
idx: 2
},
OsbToken {
inner: Gram::Bi {
t1: "and",
t2: "cat"
},
idx: 3
},
OsbToken {
inner: Gram::Uni { t1: "the" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "lazy"
},
idx: 1
},
OsbToken {
inner: Gram::Bi {
t1: "the",
t2: "cat"
},
idx: 2
},
OsbToken {
inner: Gram::Uni { t1: "lazy" },
idx: 0
},
OsbToken {
inner: Gram::Bi {
t1: "lazy",
t2: "cat"
},
idx: 1
},
OsbToken {
inner: Gram::Uni { t1: "cat" },
idx: 0
}
]
);
}
}

View file

@ -6,6 +6,7 @@ resolver = "2"
[dependencies]
utils = { path = "../utils" }
nlp = { path = "../nlp" }
maybe-async = { path = "../maybe-async" }
rocksdb = { version = "0.20.1", optional = true }
foundationdb = { version = "0.8.0", features = ["embedded-fdb-include"], optional = true }
@ -21,13 +22,9 @@ serde = { version = "1.0", features = ["derive"]}
ahash = { version = "0.8.0", features = ["serde"] }
bitpacking = "0.8.4"
lazy_static = "1.4"
whatlang = "0.16" # Language detection
rust-stemmers = "1.2" # Stemmers
tinysegmenter = "0.1" # Japanese tokenizer
jieba-rs = "0.6" # Chinese stemmer
xxhash-rust = { version = "0.8.5", features = ["xxh3"] }
farmhash = "1.1.5"
siphasher = "0.3"
siphasher = "1.0"
parking_lot = "0.12.1"
lru-cache = { version = "0.1.2", optional = true }
num_cpus = { version = "1.15.0", optional = true }

View file

@ -27,13 +27,12 @@ use std::{
hash::{Hash, Hasher},
};
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use roaring::RoaringBitmap;
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
use crate::{Deserialize, Error, Serialize};
use super::{stemmer::StemmedToken, tokenizers::Token};
pub struct BloomFilter {
m: u64,
b: RoaringBitmap,
@ -204,8 +203,8 @@ impl From<Cow<'_, str>> for BloomHash {
}
}
impl From<Token<'_>> for BloomHashGroup {
fn from(t: Token<'_>) -> Self {
impl From<Token<Cow<'_, str>>> for BloomHashGroup {
fn from(t: Token<Cow<'_, str>>) -> Self {
Self {
h1: BloomHash::hash(t.word.as_ref()),
h2: None,

View file

@ -24,6 +24,14 @@
use std::{borrow::Cow, collections::HashSet};
use ahash::AHashSet;
use nlp::{
language::{
detect::{LanguageDetector, MIN_LANGUAGE_SCORE},
stemmer::Stemmer,
Language,
},
tokenizers::{space::SpaceTokenizer, Token},
};
use utils::map::vec_map::VecMap;
use crate::{
@ -32,13 +40,7 @@ use crate::{
Serialize, HASH_EXACT, HASH_STEMMED,
};
use super::{
lang::{LanguageDetector, MIN_LANGUAGE_SCORE},
stemmer::Stemmer,
term_index::{TermIndexBuilder, TokenIndex},
tokenizers::{space::SpaceTokenizer, Token},
Language,
};
use super::term_index::{TermIndexBuilder, TokenIndex};
pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 2) as usize;
pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1;
@ -138,8 +140,8 @@ impl<'x> IntoOperations for FtsIndexBuilder<'x> {
ops.insert(Operation::hash(&token, HASH_EXACT, field, true));
terms.push(term_index.add_token(Token {
word: token.into(),
offset: 0,
len: 0,
from: 0,
to: 0,
}));
}
term_index.add_terms(field, 0, terms);

View file

@ -26,149 +26,13 @@ use crate::{
BitmapKey, Serialize, BM_HASH,
};
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK, lang::LanguageDetector};
use self::{bloom::hash_token, builder::MAX_TOKEN_MASK};
pub mod lang;
//pub mod pdf;
pub mod bloom;
pub mod builder;
pub mod ngram;
pub mod query;
pub mod search_snippet;
pub mod stemmer;
pub mod term_index;
pub mod tokenizers;
#[derive(Debug, PartialEq, Clone, Copy, Hash, Eq, serde::Serialize, serde::Deserialize)]
pub enum Language {
Esperanto = 0,
English = 1,
Russian = 2,
Mandarin = 3,
Spanish = 4,
Portuguese = 5,
Italian = 6,
Bengali = 7,
French = 8,
German = 9,
Ukrainian = 10,
Georgian = 11,
Arabic = 12,
Hindi = 13,
Japanese = 14,
Hebrew = 15,
Yiddish = 16,
Polish = 17,
Amharic = 18,
Javanese = 19,
Korean = 20,
Bokmal = 21,
Danish = 22,
Swedish = 23,
Finnish = 24,
Turkish = 25,
Dutch = 26,
Hungarian = 27,
Czech = 28,
Greek = 29,
Bulgarian = 30,
Belarusian = 31,
Marathi = 32,
Kannada = 33,
Romanian = 34,
Slovene = 35,
Croatian = 36,
Serbian = 37,
Macedonian = 38,
Lithuanian = 39,
Latvian = 40,
Estonian = 41,
Tamil = 42,
Vietnamese = 43,
Urdu = 44,
Thai = 45,
Gujarati = 46,
Uzbek = 47,
Punjabi = 48,
Azerbaijani = 49,
Indonesian = 50,
Telugu = 51,
Persian = 52,
Malayalam = 53,
Oriya = 54,
Burmese = 55,
Nepali = 56,
Sinhalese = 57,
Khmer = 58,
Turkmen = 59,
Akan = 60,
Zulu = 61,
Shona = 62,
Afrikaans = 63,
Latin = 64,
Slovak = 65,
Catalan = 66,
Tagalog = 67,
Armenian = 68,
Unknown = 69,
None = 70,
}
impl Language {
pub fn from_iso_639(code: &str) -> Option<Self> {
match code.split_once('-').map(|c| c.0).unwrap_or(code) {
"en" => Language::English,
"es" => Language::Spanish,
"pt" => Language::Portuguese,
"it" => Language::Italian,
"fr" => Language::French,
"de" => Language::German,
"ru" => Language::Russian,
"zh" => Language::Mandarin,
"ja" => Language::Japanese,
"ar" => Language::Arabic,
"hi" => Language::Hindi,
"ko" => Language::Korean,
"bn" => Language::Bengali,
"he" => Language::Hebrew,
"ur" => Language::Urdu,
"fa" => Language::Persian,
"ml" => Language::Malayalam,
"or" => Language::Oriya,
"my" => Language::Burmese,
"ne" => Language::Nepali,
"si" => Language::Sinhalese,
"km" => Language::Khmer,
"tk" => Language::Turkmen,
"am" => Language::Amharic,
"az" => Language::Azerbaijani,
"id" => Language::Indonesian,
"te" => Language::Telugu,
"ta" => Language::Tamil,
"vi" => Language::Vietnamese,
"gu" => Language::Gujarati,
"pa" => Language::Punjabi,
"uz" => Language::Uzbek,
"hy" => Language::Armenian,
"ka" => Language::Georgian,
"la" => Language::Latin,
"sl" => Language::Slovene,
"hr" => Language::Croatian,
"sr" => Language::Serbian,
"mk" => Language::Macedonian,
"lt" => Language::Lithuanian,
"lv" => Language::Latvian,
"et" => Language::Estonian,
"tl" => Language::Tagalog,
"af" => Language::Afrikaans,
"zu" => Language::Zulu,
"sn" => Language::Shona,
"ak" => Language::Akan,
_ => return None,
}
.into()
}
}
impl BitmapKey<Vec<u8>> {
pub fn hash(word: &str, account_id: u32, collection: u8, family: u8, field: u8) -> Self {
@@ -209,19 +73,3 @@ impl Operation {
}
}
}
impl Language {
pub fn detect(text: String, default: Language) -> (String, Language) {
if let Some((l, t)) = text
.split_once(':')
.and_then(|(l, t)| (Language::from_iso_639(l)?, t).into())
{
(t.to_string(), l)
} else {
let l = LanguageDetector::detect_single(&text)
.and_then(|(l, c)| if c > 0.3 { Some(l) } else { None })
.unwrap_or(default);
(text, l)
}
}
}
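For reference, the helpers removed above behave as in this sketch; the example inputs are illustrative and not taken from the commit:

```rust
// ISO 639 lookup ignores any region subtag and returns None for unknown codes.
assert_eq!(Language::from_iso_639("pt-BR"), Some(Language::Portuguese));
assert_eq!(Language::from_iso_639("xx"), None);

// detect() honors an explicit "<code>:" prefix; otherwise it runs automatic
// detection, keeps the result only above the 0.3 confidence threshold, and
// falls back to the supplied default.
let (text, lang) = Language::detect("en:Hello world".to_string(), Language::Unknown);
assert_eq!((text.as_str(), lang), ("Hello world", Language::English));
```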

View file

@@ -21,14 +21,14 @@
* for more details.
*/
use nlp::language::{stemmer::Stemmer, Language};
use roaring::RoaringBitmap;
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, stemmer::Stemmer, tokenizers::Tokenizer},
BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, ValueKey, HASH_EXACT, HASH_STEMMED,
};
use super::{term_index::TermIndex, Language};
use super::term_index::TermIndex;
impl ReadTransaction<'_> {
#[maybe_async::maybe_async]
@@ -44,7 +44,7 @@ impl ReadTransaction<'_> {
if match_phrase {
let mut phrase = Vec::new();
let mut bit_keys = Vec::new();
for token in Tokenizer::new(text, language, MAX_TOKEN_LENGTH) {
for token in language.tokenize_text(text, MAX_TOKEN_LENGTH) {
let key = BitmapKey::hash(
token.word.as_ref(),
account_id,

View file

@@ -134,12 +134,10 @@ pub fn generate_snippet(terms: &[Term], text: &str) -> Option<String> {
#[cfg(test)]
mod tests {
use nlp::language::Language;
use crate::{
fts::{
term_index::{TermIndex, TermIndexBuilder},
tokenizers::Tokenizer,
Language,
},
fts::term_index::{TermIndex, TermIndexBuilder},
Deserialize, Serialize,
};
@@ -242,7 +240,7 @@ mod tests {
for (field_num, part) in parts.iter().enumerate() {
let mut terms = Vec::new();
for token in Tokenizer::new(part, Language::English, 40) {
for token in Language::English.tokenize_text(part, 40) {
terms.push(builder.add_token(token));
}
builder.add_terms(field_num as u8, 0, terms);
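Call sites now tokenize through the `Language` value itself instead of constructing a `Tokenizer`. A minimal usage sketch, assuming `tokenize_text` yields tokens whose `word` is a `Cow<str>` as in the hunks above:

```rust
use nlp::language::Language;

// Collect words no longer than 40 bytes from a text part, mirroring the updated call sites.
fn words(text: &str) -> Vec<String> {
    Language::English
        .tokenize_text(text, 40) // replaces Tokenizer::new(text, Language::English, 40)
        .map(|token| token.word.into_owned())
        .collect()
}
```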

View file

@@ -21,14 +21,13 @@
* for more details.
*/
use std::convert::TryInto;
use std::{borrow::Cow, convert::TryInto};
use crate::{Deserialize, Serialize};
use super::{stemmer::StemmedToken, tokenizers::Token};
use ahash::{AHashMap, AHashSet};
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
use nlp::{language::stemmer::StemmedToken, tokenizers::Token};
use utils::codec::leb128::{Leb128Reader, Leb128Vec};
#[derive(Debug)]
@@ -227,7 +226,7 @@ impl TermIndexBuilder {
}
}
pub fn add_token(&mut self, token: Token) -> Term {
pub fn add_token(&mut self, token: Token<Cow<str>>) -> Term {
let id = self.terms.len() as u32;
let id = self
.terms
@@ -236,8 +235,8 @@ impl TermIndexBuilder {
Term {
id: *id,
id_stemmed: *id,
offset: token.offset,
len: token.len,
offset: token.from as u32,
len: (token.to - token.from) as u8,
}
}
@@ -259,8 +258,8 @@ impl TermIndexBuilder {
Term {
id,
id_stemmed,
offset: token.offset,
len: token.len,
offset: token.from as u32,
len: (token.to - token.from) as u8,
}
}
@@ -775,13 +774,10 @@ impl TokenIndex {
mod tests {
use ahash::AHashMap;
use nlp::language::{stemmer::Stemmer, Language};
use crate::{
fts::{
stemmer::Stemmer,
term_index::{TermIndexBuilder, TokenIndex},
Language,
},
fts::term_index::{TermIndexBuilder, TokenIndex},
Deserialize, Serialize,
};

View file

@@ -1,197 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, vec::IntoIter};
use jieba_rs::Jieba;
use super::{word::WordTokenizer, Token};
use lazy_static::lazy_static;
lazy_static! {
static ref JIEBA: Jieba = Jieba::new();
}
pub struct ChineseTokenizer<'x> {
word_tokenizer: WordTokenizer<'x>,
tokens: IntoIter<&'x str>,
token_offset: usize,
token_len: usize,
token_len_cur: usize,
max_token_length: usize,
}
impl<'x> ChineseTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> ChineseTokenizer {
ChineseTokenizer {
word_tokenizer: WordTokenizer::new(text),
tokens: Vec::new().into_iter(),
max_token_length,
token_offset: 0,
token_len: 0,
token_len_cur: 0,
}
}
}
impl<'x> Iterator for ChineseTokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(ch_token) = self.tokens.next() {
let offset_start = self.token_offset + self.token_len_cur;
self.token_len_cur += ch_token.len();
if ch_token.len() <= self.max_token_length {
return Token::new(offset_start, ch_token.len(), ch_token.into()).into();
}
} else {
loop {
let (token, is_ascii) = self.word_tokenizer.next()?;
if !is_ascii {
let word = match token.word {
Cow::Borrowed(word) => word,
Cow::Owned(_) => unreachable!(),
};
self.tokens = JIEBA.cut(word, false).into_iter();
self.token_offset = token.offset as usize;
self.token_len = token.len as usize;
self.token_len_cur = 0;
break;
} else if token.len as usize <= self.max_token_length {
return token.into();
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn chinese_tokenizer() {
assert_eq!(
ChineseTokenizer::new(
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
40
)
.collect::<Vec<_>>(),
vec![
Token {
word: "".into(),
offset: 0,
len: 3
},
Token {
word: "".into(),
offset: 3,
len: 3
},
Token {
word: "".into(),
offset: 6,
len: 3
},
Token {
word: "".into(),
offset: 12,
len: 3
},
Token {
word: "".into(),
offset: 15,
len: 3
},
Token {
word: "".into(),
offset: 21,
len: 3
},
Token {
word: "".into(),
offset: 24,
len: 3
},
Token {
word: "大事".into(),
offset: 27,
len: 6
},
Token {
word: "".into(),
offset: 36,
len: 3
},
Token {
word: "".into(),
offset: 39,
len: 3
},
Token {
word: "".into(),
offset: 42,
len: 3
},
Token {
word: "".into(),
offset: 45,
len: 3
},
Token {
word: "存亡".into(),
offset: 51,
len: 6
},
Token {
word: "".into(),
offset: 57,
len: 3
},
Token {
word: "".into(),
offset: 60,
len: 3
},
Token {
word: "不可不".into(),
offset: 66,
len: 9
},
Token {
word: "".into(),
offset: 75,
len: 3
},
Token {
word: "".into(),
offset: 78,
len: 3
}
]
);
}
}

View file

@@ -1,168 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::vec::IntoIter;
use super::{word::WordTokenizer, Token};
pub struct JapaneseTokenizer<'x> {
word_tokenizer: WordTokenizer<'x>,
tokens: IntoIter<String>,
token_offset: usize,
token_len: usize,
token_len_cur: usize,
max_token_length: usize,
}
impl<'x> JapaneseTokenizer<'x> {
pub fn new(text: &str, max_token_length: usize) -> JapaneseTokenizer {
JapaneseTokenizer {
word_tokenizer: WordTokenizer::new(text),
tokens: Vec::new().into_iter(),
max_token_length,
token_offset: 0,
token_len: 0,
token_len_cur: 0,
}
}
}
impl<'x> Iterator for JapaneseTokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(jp_token) = self.tokens.next() {
let offset_start = self.token_offset + self.token_len_cur;
self.token_len_cur += jp_token.len();
if jp_token.len() <= self.max_token_length {
return Token::new(offset_start, jp_token.len(), jp_token.into()).into();
}
} else {
loop {
let (token, is_ascii) = self.word_tokenizer.next()?;
if !is_ascii {
self.tokens = tinysegmenter::tokenize(token.word.as_ref()).into_iter();
self.token_offset = token.offset as usize;
self.token_len = token.len as usize;
self.token_len_cur = 0;
break;
} else if token.len as usize <= self.max_token_length {
return token.into();
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn japanese_tokenizer() {
assert_eq!(
JapaneseTokenizer::new("お先に失礼します あなたの名前は何ですか 123 abc-872", 40)
.collect::<Vec<_>>(),
vec![
Token {
word: "お先".into(),
offset: 0,
len: 6
},
Token {
word: "".into(),
offset: 6,
len: 3
},
Token {
word: "失礼".into(),
offset: 9,
len: 6
},
Token {
word: "".into(),
offset: 15,
len: 3
},
Token {
word: "ます".into(),
offset: 18,
len: 6
},
Token {
word: "あなた".into(),
offset: 25,
len: 9
},
Token {
word: "".into(),
offset: 34,
len: 3
},
Token {
word: "名前".into(),
offset: 37,
len: 6
},
Token {
word: "".into(),
offset: 43,
len: 3
},
Token {
word: "".into(),
offset: 46,
len: 3
},
Token {
word: "です".into(),
offset: 49,
len: 6
},
Token {
word: "".into(),
offset: 55,
len: 3
},
Token {
word: "123".into(),
offset: 59,
len: 3
},
Token {
word: "abc".into(),
offset: 63,
len: 3
},
Token {
word: "872".into(),
offset: 67,
len: 3
}
]
);
}
}

View file

@@ -1,96 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
pub mod chinese;
pub mod indo_european;
pub mod japanese;
pub mod space;
pub mod word;
use std::borrow::Cow;
use self::{
chinese::ChineseTokenizer, indo_european::IndoEuropeanTokenizer, japanese::JapaneseTokenizer,
};
use super::Language;
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'x> {
pub word: Cow<'x, str>,
pub offset: u32, // Word offset in the text part
pub len: u8, // Word length
}
impl<'x> Token<'x> {
pub fn new(offset: usize, len: usize, word: Cow<'x, str>) -> Token<'x> {
debug_assert!(offset <= u32::max_value() as usize);
debug_assert!(len <= u8::max_value() as usize);
Token {
offset: offset as u32,
len: len as u8,
word,
}
}
}
enum LanguageTokenizer<'x> {
IndoEuropean(IndoEuropeanTokenizer<'x>),
Japanese(JapaneseTokenizer<'x>),
Chinese(ChineseTokenizer<'x>),
}
pub struct Tokenizer<'x> {
tokenizer: LanguageTokenizer<'x>,
}
impl<'x> Tokenizer<'x> {
pub fn new(text: &'x str, language: Language, max_token_length: usize) -> Self {
Tokenizer {
tokenizer: match language {
Language::Japanese => {
LanguageTokenizer::Japanese(JapaneseTokenizer::new(text, max_token_length))
}
Language::Mandarin => {
LanguageTokenizer::Chinese(ChineseTokenizer::new(text, max_token_length))
}
_ => LanguageTokenizer::IndoEuropean(IndoEuropeanTokenizer::new(
text,
max_token_length,
)),
},
}
}
}
impl<'x> Iterator for Tokenizer<'x> {
type Item = Token<'x>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.tokenizer {
LanguageTokenizer::IndoEuropean(tokenizer) => tokenizer.next(),
LanguageTokenizer::Chinese(tokenizer) => tokenizer.next(),
LanguageTokenizer::Japanese(tokenizer) => tokenizer.next(),
}
}
}

View file

@@ -1,80 +0,0 @@
/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* in the LICENSE file at the top-level directory of this distribution.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::str::CharIndices;
use super::Token;
pub struct WordTokenizer<'x> {
text: &'x str,
iterator: CharIndices<'x>,
}
impl<'x> WordTokenizer<'x> {
pub fn new(text: &str) -> WordTokenizer {
WordTokenizer {
text,
iterator: text.char_indices(),
}
}
}
/// Parses text into tokens, used by non-IndoEuropean tokenizers.
impl<'x> Iterator for WordTokenizer<'x> {
type Item = (Token<'x>, bool);
fn next(&mut self) -> Option<Self::Item> {
let mut is_ascii = true;
while let Some((token_start, ch)) = self.iterator.next() {
if ch.is_alphanumeric() {
let token_end = (&mut self.iterator)
.filter_map(|(pos, ch)| {
if ch.is_alphanumeric() {
if is_ascii && !ch.is_ascii() {
is_ascii = false;
}
None
} else {
pos.into()
}
})
.next()
.unwrap_or(self.text.len());
let token_len = token_end - token_start;
if token_end > token_start {
return (
Token::new(
token_start,
token_len,
self.text[token_start..token_end].into(),
),
is_ascii,
)
.into();
}
}
}
None
}
}

View file

@@ -24,12 +24,10 @@
use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign};
use ahash::HashSet;
use nlp::tokenizers::space::SpaceTokenizer;
use roaring::RoaringBitmap;
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
BitmapKey, ReadTransaction, Store,
};
use crate::{fts::builder::MAX_TOKEN_LENGTH, BitmapKey, ReadTransaction, Store};
use super::{Filter, ResultSet, TextMatch};

View file

@@ -26,11 +26,10 @@ pub mod get;
pub mod log;
pub mod sort;
use nlp::language::Language;
use roaring::RoaringBitmap;
use crate::{
fts::Language, write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS,
};
use crate::{write::BitmapFamily, BitmapKey, Deserialize, Serialize, BM_DOCUMENT_IDS};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Operator {

View file

@@ -23,11 +23,11 @@
use std::{collections::HashSet, slice::Iter, time::SystemTime};
use nlp::tokenizers::space::SpaceTokenizer;
use utils::codec::leb128::{Leb128Iterator, Leb128Vec};
use crate::{
fts::{builder::MAX_TOKEN_LENGTH, tokenizers::space::SpaceTokenizer},
Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
fts::builder::MAX_TOKEN_LENGTH, Deserialize, Serialize, BM_TAG, HASH_EXACT, TAG_ID, TAG_STATIC,
};
use self::assert::AssertValue;

View file

@@ -12,6 +12,7 @@ foundationdb = ["store/foundation"]
[dependencies]
store = { path = "../crates/store", features = ["test_mode"] }
nlp = { path = "../crates/nlp" }
directory = { path = "../crates/directory" }
jmap = { path = "../crates/jmap", features = ["test_mode"] }
jmap_proto = { path = "../crates/jmap-proto" }

View file

@@ -27,10 +27,11 @@ use std::{
};
use jmap_proto::types::keyword::Keyword;
use nlp::language::Language;
use store::{ahash::AHashMap, query::sort::Pagination};
use store::{
fts::{builder::FtsIndexBuilder, Language},
fts::builder::FtsIndexBuilder,
query::{Comparator, Filter},
write::{BatchBuilder, F_BITMAP, F_INDEX, F_VALUE},
Store, ValueKey,