mail-server/crates/nlp/src/tokenizers/chinese.rs

/*
* Copyright (c) 2023, Stalwart Labs Ltd.
*
* This file is part of Stalwart Mail Server.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program, in the LICENSE file at the top-level directory
 * of this distribution. If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the AGPLv3 license by
* purchasing a commercial license. Please contact licensing@stalw.art
* for more details.
*/
use std::{borrow::Cow, vec::IntoIter};
use jieba_rs::Jieba;
use super::{InnerToken, Token};
use lazy_static::lazy_static;
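
// Jieba loads its default dictionary on construction, which is comparatively
// expensive, so a single shared instance is initialized lazily and reused by
// every `ChineseTokenizer`.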
lazy_static! {
pub static ref JIEBA: Jieba = Jieba::new();
}
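
/// Wraps an inner tokenizer and re-segments every token containing non-ASCII
/// alphabetic text (which is where CJK input lands) into Chinese words using
/// the shared Jieba instance, buffering the resulting sub-tokens. All other
/// tokens pass through unchanged.
///
/// A minimal usage sketch, mirroring the unit test at the bottom of this file:
///
/// ```ignore
/// use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer};
///
/// let words: Vec<_> = ChineseTokenizer::new(WordTokenizer::new("孫子曰", 40))
///     .map(|token| token.word)
///     .collect();
/// ```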
pub struct ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
tokenizer: T,
tokens: IntoIter<Token<I>>,
phantom: std::marker::PhantomData<&'x str>,
}
impl<'x, T, I> ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
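    /// Creates a tokenizer that pulls from `tokenizer`, starting with an
    /// empty buffer of segmented sub-tokens.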
pub fn new(tokenizer: T) -> Self {
ChineseTokenizer {
tokenizer,
tokens: Vec::new().into_iter(),
phantom: std::marker::PhantomData,
}
}
}
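
// The iterator first drains any sub-tokens buffered from a previous Jieba
// segmentation; only when the buffer is empty does it pull the next token
// from the inner tokenizer.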
impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
where
T: Iterator<Item = Token<I>>,
I: InnerToken<'x>,
{
type Item = Token<I>;
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some(token) = self.tokens.next() {
return Some(token);
} else {
let token = self.tokenizer.next()?;
if token.word.is_alphabetic_8bit() {
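                    // Track byte offsets relative to the original input so
                    // that every sub-token keeps an accurate `from..to` span.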
let mut token_to = token.from;
match token.word.unwrap_alphabetic() {
Cow::Borrowed(word) => {
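                            // Borrowed input: Jieba's segments are slices of
                            // the same underlying buffer, so they can be
                            // wrapped directly without copying.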
self.tokens = JIEBA
.cut(word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
Cow::Owned(word) => {
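                            // Owned input: the segments borrow from the
                            // temporary `word`, so each one must be copied
                            // into an owned token.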
self.tokens = JIEBA
.cut(&word, false)
.into_iter()
.map(|word| {
let token_from = token_to;
token_to += word.len();
Token {
word: I::new_alphabetic(word.to_string()),
from: token_from,
to: token_to,
}
})
.collect::<Vec<_>>()
.into_iter();
}
}
} else {
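                    // Not non-ASCII alphabetic text: forward the token
                    // unchanged.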
return token.into();
}
}
}
}
}
#[cfg(test)]
mod tests {
use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};
#[test]
fn chinese_tokenizer() {
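        // Expected offsets are UTF-8 byte positions: every CJK character and
        // full-width punctuation mark in the input occupies 3 bytes, and the
        // word tokenizer skips the punctuation.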
assert_eq!(
ChineseTokenizer::new(WordTokenizer::new(
"孫子曰:兵者,國之大事,死生之地,存亡之道,不可不察也。",
40
            ))
.collect::<Vec<_>>(),
            vec![
                Token {
                    word: "孫".into(),
                    from: 0,
                    to: 3
                },
                Token {
                    word: "子".into(),
                    from: 3,
                    to: 6
                },
                Token {
                    word: "曰".into(),
                    from: 6,
                    to: 9
                },
                Token {
                    word: "兵".into(),
                    from: 12,
                    to: 15
                },
                Token {
                    word: "者".into(),
                    from: 15,
                    to: 18
                },
                Token {
                    word: "國".into(),
                    from: 21,
                    to: 24
                },
                Token {
                    word: "之".into(),
                    from: 24,
                    to: 27
                },
                Token {
                    word: "大事".into(),
                    from: 27,
                    to: 33
                },
                Token {
                    word: "死".into(),
                    from: 36,
                    to: 39
                },
                Token {
                    word: "生".into(),
                    from: 39,
                    to: 42
                },
                Token {
                    word: "之".into(),
                    from: 42,
                    to: 45
                },
                Token {
                    word: "地".into(),
                    from: 45,
                    to: 48
                },
                Token {
                    word: "存亡".into(),
                    from: 51,
                    to: 57
                },
                Token {
                    word: "之".into(),
                    from: 57,
                    to: 60
                },
                Token {
                    word: "道".into(),
                    from: 60,
                    to: 63
                },
                Token {
                    word: "不可不".into(),
                    from: 66,
                    to: 75
                },
                Token {
                    word: "察".into(),
                    from: 75,
                    to: 78
                },
                Token {
                    word: "也".into(),
                    from: 78,
                    to: 81
                }
            ]
);
}
}