From ec3be629903d27d35e75d9b08f08531e8f2737a8 Mon Sep 17 00:00:00 2001 From: mdecimus Date: Tue, 15 Oct 2024 16:59:05 +0200 Subject: [PATCH] Remove inefficient TypesTokenizer peek (fixes #863) --- Cargo.lock | 169 ++++++++++++----------------- crates/nlp/src/tokenizers/types.rs | 34 +++--- 2 files changed, 86 insertions(+), 117 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 77b570e2..92b15c5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.13" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" +checksum = "e26a9844c659a2a293d239c7910b752f8487fe122c6c8bd1659bf85a6507c302" dependencies = [ "flate2", "futures-core", @@ -542,9 +542,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags 2.6.0", "cexpr", @@ -552,15 +552,12 @@ dependencies = [ "itertools 0.12.1", "lazy_static", "lazycell", - "log", - "prettyplease", "proc-macro2", "quote", "regex", "rustc-hash 1.1.0", "shlex", "syn 2.0.79", - "which", ] [[package]] @@ -573,6 +570,8 @@ dependencies = [ "cexpr", "clang-sys", "itertools 0.13.0", + "log", + "prettyplease", "proc-macro2", "quote", "regex", @@ -798,9 +797,9 @@ dependencies = [ [[package]] name = "bytemuck" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" [[package]] name = "byteorder" @@ -865,9 +864,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.25" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d9e0b4957f635b8d3da819d0db5603620467ecf1f692d22a8c2717ce27e6d8" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", @@ -972,9 +971,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" dependencies = [ "clap_builder", "clap_derive", @@ -982,9 +981,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" dependencies = [ "anstream", "anstyle", @@ -1969,16 +1968,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" -[[package]] -name = "errno" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "event_macro" version = "0.1.0" @@ -2068,6 +2057,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + [[package]] name = "foreign-types" version = "0.3.2" @@ -2110,9 +2105,9 @@ dependencies = [ [[package]] name = "foundationdb" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bf4ae7238dbdb1ff01e9f981db028515cf66883c461e29faedfea130b2728" +checksum = "c30b4254cb4c01b50a74dad6aa48e17666726300b3adddb3c959a7e852f286bd" dependencies = [ "async-recursion", "async-trait", @@ -2131,18 +2126,18 @@ dependencies = [ [[package]] name = "foundationdb-gen" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36878d54a76a48e794d0fe89be2096ab5968b071e7ec25f7becfe7846f55fa77" +checksum = "93b9deedf92107a1076e518d60931b6ed1a632ae0c51e7b491bfd42edb4148ce" dependencies = [ "xml-rs", ] [[package]] name = "foundationdb-macros" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8db6653cbc621a3810d95d55bd342be3e71181d6df21a4eb29ef986202d3f9c" +checksum = "afc84a5ff0dba78222551017f5625f3365aa09551c78cfaa44136fc6818c2611" dependencies = [ "proc-macro2", "quote", @@ -2152,11 +2147,11 @@ dependencies = [ [[package]] name = "foundationdb-sys" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace2f49db8614b7d7e3b656a12e0059b5fbd0a4da3410b1797374bec3db269fa" +checksum = "3bae14dba30b8dcc4905a9189ebb18bc9db9744ef0ad8f2b94ef00d21e176964" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.70.1", "libc", ] @@ -2458,6 +2453,11 @@ name = "hashbrown" version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "hashlink" @@ -3327,9 +3327,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -3483,7 +3483,7 @@ version = "0.16.0+8.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "bzip2-sys", "cc", "glob", @@ -3521,12 +3521,6 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" -[[package]] -name = "linux-raw-sys" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" - [[package]] name = "litemap" version = "0.7.3" @@ -3557,11 +3551,11 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lru" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.0", ] [[package]] @@ -4440,18 +4434,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +checksum = "baf123a161dde1e524adf36f90bc5d8d3462824a9c43553ad07a8183161189ec" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +checksum = "a4502d8515ca9f32f1fb543d987f63d95a14934883db45bdb48060b6b69257f8" dependencies = [ "proc-macro2", "quote", @@ -5494,19 +5488,6 @@ dependencies = [ "nom", ] -[[package]] -name = "rustix" -version = "0.38.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" -dependencies = [ - "bitflags 2.6.0", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.52.0", -] - [[package]] name = "rustls" version = "0.21.12" @@ -5592,9 +5573,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -5619,9 +5600,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" @@ -5655,18 +5636,18 @@ checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71" [[package]] name = "scc" -version = "2.2.0" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836f1e0f4963ef5288b539b643b35e043e76a32d0f4e47e67febf69576527f50" +checksum = "f2c1f7fc6deb21665a9060dfc7d271be784669295a31babdcd4dd2c79ae8cbfb" dependencies = [ "sdd", ] [[package]] name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] @@ -5710,9 +5691,9 @@ dependencies = [ [[package]] name = "sdd" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a7b59a5d9b0099720b417b6325d91a52cbf5b3dcb5041d864be53eefa58abc" +checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95" [[package]] name = "seahash" @@ -7209,9 +7190,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -7220,9 +7201,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -7235,9 +7216,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.43" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" +checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" dependencies = [ "cfg-if", "js-sys", @@ -7247,9 +7228,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7257,9 +7238,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -7270,9 +7251,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "wasm-streams" @@ -7289,9 +7270,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", @@ -7332,18 +7313,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix", -] - [[package]] name = "whoami" version = "1.5.2" @@ -7383,7 +7352,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] diff --git a/crates/nlp/src/tokenizers/types.rs b/crates/nlp/src/tokenizers/types.rs index cb12035e..e5b77677 100644 --- a/crates/nlp/src/tokenizers/types.rs +++ b/crates/nlp/src/tokenizers/types.rs @@ -8,6 +8,7 @@ use std::str::CharIndices; use super::Token; +#[derive(Debug)] pub struct TypesTokenizer<'x> { text: &'x str, iter: CharIndices<'x>, @@ -67,32 +68,30 @@ impl<'x> Iterator for TypesTokenizer<'x> { } // Try parsing email - if self.tokenize_emails - && token.word.is_email_atom() - && self.peek_has_tokens( - &[TokenType::Punctuation('@'), TokenType::Punctuation('.')], - TokenType::Space, - ) + if self.tokenize_emails && token.word.is_email_atom() + /*&& self.peek_has_tokens( + &[TokenType::Punctuation('@'), TokenType::Punctuation('.')], + TokenType::Space, + )*/ { + self.peek_rewind(); if let Some(email) = self.try_parse_email() { self.peek_advance(); return Some(email); - } else { - self.peek_rewind(); } + self.peek_rewind(); } // Try parsing URL without scheme - if self.tokenize_urls_without_scheme - && token.word.is_domain_atom(true) - && self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space) + if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true) + //&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space) { + self.peek_rewind(); if let Some(url) = self.try_parse_url(None) { self.peek_advance(); return Some(url); - } else { - self.peek_rewind(); } + self.peek_rewind(); } // Try parsing currencies and floating point numbers @@ -243,14 +242,15 @@ impl<'x> TypesTokenizer<'x> { } } + #[inline(always)] fn peek_rewind(&mut self) { self.peek_pos = 0; } - fn peek_has_tokens( + /*fn peek_has_tokens( &mut self, tokens: &[TokenType<&'_ str>], - stop_token: TokenType<&'_ str>, + stop_token: impl Fn(&TokenType<&'_ str>) -> bool, ) -> bool { let mut tokens = tokens.iter().copied(); let mut token = tokens.next().unwrap(); @@ -262,14 +262,14 @@ impl<'x> TypesTokenizer<'x> { self.peek_rewind(); return true; } - } else if t.word == stop_token { + } else if stop_token(&t.word) { break; } } self.peek_rewind(); false - } + }*/ fn try_parse_url( &mut self,