Remove inefficient TypesTokenizer peek (fixes #863)

This commit is contained in:
mdecimus 2024-10-15 16:59:05 +02:00
parent 4f49f810d8
commit ec3be62990
2 changed files with 86 additions and 117 deletions

169
Cargo.lock generated
View file

@ -306,9 +306,9 @@ dependencies = [
[[package]]
name = "async-compression"
version = "0.4.13"
version = "0.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429"
checksum = "e26a9844c659a2a293d239c7910b752f8487fe122c6c8bd1659bf85a6507c302"
dependencies = [
"flate2",
"futures-core",
@ -542,9 +542,9 @@ dependencies = [
[[package]]
name = "bindgen"
version = "0.69.4"
version = "0.69.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
dependencies = [
"bitflags 2.6.0",
"cexpr",
@ -552,15 +552,12 @@ dependencies = [
"itertools 0.12.1",
"lazy_static",
"lazycell",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash 1.1.0",
"shlex",
"syn 2.0.79",
"which",
]
[[package]]
@ -573,6 +570,8 @@ dependencies = [
"cexpr",
"clang-sys",
"itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
@ -798,9 +797,9 @@ dependencies = [
[[package]]
name = "bytemuck"
version = "1.18.0"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae"
checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d"
[[package]]
name = "byteorder"
@ -865,9 +864,9 @@ dependencies = [
[[package]]
name = "cc"
version = "1.1.25"
version = "1.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d9e0b4957f635b8d3da819d0db5603620467ecf1f692d22a8c2717ce27e6d8"
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
dependencies = [
"jobserver",
"libc",
@ -972,9 +971,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.5.19"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615"
checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
dependencies = [
"clap_builder",
"clap_derive",
@ -982,9 +981,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.19"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b"
checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
dependencies = [
"anstream",
"anstyle",
@ -1969,16 +1968,6 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "event_macro"
version = "0.1.0"
@ -2068,6 +2057,12 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foldhash"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2"
[[package]]
name = "foreign-types"
version = "0.3.2"
@ -2110,9 +2105,9 @@ dependencies = [
[[package]]
name = "foundationdb"
version = "0.9.0"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "020bf4ae7238dbdb1ff01e9f981db028515cf66883c461e29faedfea130b2728"
checksum = "c30b4254cb4c01b50a74dad6aa48e17666726300b3adddb3c959a7e852f286bd"
dependencies = [
"async-recursion",
"async-trait",
@ -2131,18 +2126,18 @@ dependencies = [
[[package]]
name = "foundationdb-gen"
version = "0.9.0"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36878d54a76a48e794d0fe89be2096ab5968b071e7ec25f7becfe7846f55fa77"
checksum = "93b9deedf92107a1076e518d60931b6ed1a632ae0c51e7b491bfd42edb4148ce"
dependencies = [
"xml-rs",
]
[[package]]
name = "foundationdb-macros"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8db6653cbc621a3810d95d55bd342be3e71181d6df21a4eb29ef986202d3f9c"
checksum = "afc84a5ff0dba78222551017f5625f3365aa09551c78cfaa44136fc6818c2611"
dependencies = [
"proc-macro2",
"quote",
@ -2152,11 +2147,11 @@ dependencies = [
[[package]]
name = "foundationdb-sys"
version = "0.9.0"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace2f49db8614b7d7e3b656a12e0059b5fbd0a4da3410b1797374bec3db269fa"
checksum = "3bae14dba30b8dcc4905a9189ebb18bc9db9744ef0ad8f2b94ef00d21e176964"
dependencies = [
"bindgen 0.69.4",
"bindgen 0.70.1",
"libc",
]
@ -2458,6 +2453,11 @@ name = "hashbrown"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "hashlink"
@ -3327,9 +3327,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.70"
version = "0.3.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a"
checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9"
dependencies = [
"wasm-bindgen",
]
@ -3483,7 +3483,7 @@ version = "0.16.0+8.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c"
dependencies = [
"bindgen 0.69.4",
"bindgen 0.69.5",
"bzip2-sys",
"cc",
"glob",
@ -3521,12 +3521,6 @@ version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "litemap"
version = "0.7.3"
@ -3557,11 +3551,11 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "lru"
version = "0.12.4"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904"
checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
dependencies = [
"hashbrown 0.14.5",
"hashbrown 0.15.0",
]
[[package]]
@ -4440,18 +4434,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.5"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
checksum = "baf123a161dde1e524adf36f90bc5d8d3462824a9c43553ad07a8183161189ec"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.5"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
checksum = "a4502d8515ca9f32f1fb543d987f63d95a14934883db45bdb48060b6b69257f8"
dependencies = [
"proc-macro2",
"quote",
@ -5494,19 +5488,6 @@ dependencies = [
"nom",
]
[[package]]
name = "rustix"
version = "0.38.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811"
dependencies = [
"bitflags 2.6.0",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.52.0",
]
[[package]]
name = "rustls"
version = "0.21.12"
@ -5592,9 +5573,9 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
version = "1.9.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55"
checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
[[package]]
name = "rustls-webpki"
@ -5619,9 +5600,9 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.17"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248"
[[package]]
name = "ryu"
@ -5655,18 +5636,18 @@ checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71"
[[package]]
name = "scc"
version = "2.2.0"
version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "836f1e0f4963ef5288b539b643b35e043e76a32d0f4e47e67febf69576527f50"
checksum = "f2c1f7fc6deb21665a9060dfc7d271be784669295a31babdcd4dd2c79ae8cbfb"
dependencies = [
"sdd",
]
[[package]]
name = "schannel"
version = "0.1.24"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b"
checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1"
dependencies = [
"windows-sys 0.59.0",
]
@ -5710,9 +5691,9 @@ dependencies = [
[[package]]
name = "sdd"
version = "3.0.3"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a7b59a5d9b0099720b417b6325d91a52cbf5b3dcb5041d864be53eefa58abc"
checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95"
[[package]]
name = "seahash"
@ -7209,9 +7190,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
[[package]]
name = "wasm-bindgen"
version = "0.2.93"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5"
checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
dependencies = [
"cfg-if",
"once_cell",
@ -7220,9 +7201,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.93"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b"
checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358"
dependencies = [
"bumpalo",
"log",
@ -7235,9 +7216,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.43"
version = "0.4.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed"
checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b"
dependencies = [
"cfg-if",
"js-sys",
@ -7247,9 +7228,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.93"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf"
checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@ -7257,9 +7238,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.93"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
dependencies = [
"proc-macro2",
"quote",
@ -7270,9 +7251,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.93"
version = "0.2.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484"
checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
[[package]]
name = "wasm-streams"
@ -7289,9 +7270,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.70"
version = "0.3.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0"
checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112"
dependencies = [
"js-sys",
"wasm-bindgen",
@ -7332,18 +7313,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix",
]
[[package]]
name = "whoami"
version = "1.5.2"
@ -7383,7 +7352,7 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys 0.59.0",
"windows-sys 0.52.0",
]
[[package]]

View file

@ -8,6 +8,7 @@ use std::str::CharIndices;
use super::Token;
#[derive(Debug)]
pub struct TypesTokenizer<'x> {
text: &'x str,
iter: CharIndices<'x>,
@ -67,32 +68,30 @@ impl<'x> Iterator for TypesTokenizer<'x> {
}
// Try parsing email
if self.tokenize_emails
&& token.word.is_email_atom()
&& self.peek_has_tokens(
&[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
TokenType::Space,
)
if self.tokenize_emails && token.word.is_email_atom()
/*&& self.peek_has_tokens(
&[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
TokenType::Space,
)*/
{
self.peek_rewind();
if let Some(email) = self.try_parse_email() {
self.peek_advance();
return Some(email);
} else {
self.peek_rewind();
}
self.peek_rewind();
}
// Try parsing URL without scheme
if self.tokenize_urls_without_scheme
&& token.word.is_domain_atom(true)
&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true)
//&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
{
self.peek_rewind();
if let Some(url) = self.try_parse_url(None) {
self.peek_advance();
return Some(url);
} else {
self.peek_rewind();
}
self.peek_rewind();
}
// Try parsing currencies and floating point numbers
@ -243,14 +242,15 @@ impl<'x> TypesTokenizer<'x> {
}
}
#[inline(always)]
fn peek_rewind(&mut self) {
self.peek_pos = 0;
}
fn peek_has_tokens(
/*fn peek_has_tokens(
&mut self,
tokens: &[TokenType<&'_ str>],
stop_token: TokenType<&'_ str>,
stop_token: impl Fn(&TokenType<&'_ str>) -> bool,
) -> bool {
let mut tokens = tokens.iter().copied();
let mut token = tokens.next().unwrap();
@ -262,14 +262,14 @@ impl<'x> TypesTokenizer<'x> {
self.peek_rewind();
return true;
}
} else if t.word == stop_token {
} else if stop_token(&t.word) {
break;
}
}
self.peek_rewind();
false
}
}*/
fn try_parse_url(
&mut self,