/*
 * Copyright (c) 2023, Stalwart Labs Ltd.
 *
 * This file is part of Stalwart Mail Server.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details, available
 * in the LICENSE file at the top-level directory of this distribution.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 * You can be released from the requirements of the AGPLv3 license by
 * purchasing a commercial license. Please contact licensing@stalw.art
 * for more details.
 */

use std::{borrow::Cow, vec::IntoIter};

use jieba_rs::Jieba;
use lazy_static::lazy_static;

use super::{InnerToken, Token};

lazy_static! {
    /// Shared jieba-rs segmenter, initialized once on first use.
    pub static ref JIEBA: Jieba = Jieba::new();
}

/// Wraps another tokenizer and re-segments non-ASCII alphabetic tokens
/// (e.g. runs of CJK ideographs) with jieba-rs, passing all other tokens
/// through unchanged.
pub struct ChineseTokenizer<'x, T, I>
where
    T: Iterator<Item = Token<I>>,
    I: InnerToken<'x>,
{
    tokenizer: T,
    /// Tokens buffered from the most recent jieba segmentation.
    tokens: IntoIter<Token<I>>,
    phantom: std::marker::PhantomData<&'x str>,
}

impl<'x, T, I> ChineseTokenizer<'x, T, I>
where
    T: Iterator<Item = Token<I>>,
    I: InnerToken<'x>,
{
    pub fn new(tokenizer: T) -> Self {
        ChineseTokenizer {
            tokenizer,
            tokens: Vec::new().into_iter(),
            phantom: std::marker::PhantomData,
        }
    }
}

impl<'x, T, I> Iterator for ChineseTokenizer<'x, T, I>
where
    T: Iterator<Item = Token<I>>,
    I: InnerToken<'x>,
{
    type Item = Token<I>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Drain any tokens buffered by a previous jieba segmentation.
            if let Some(token) = self.tokens.next() {
                return Some(token);
            } else {
                let token = self.tokenizer.next()?;
                if token.word.is_alphabetic_8bit() {
                    // Alphabetic token containing non-ASCII ("8-bit")
                    // characters: segment it with jieba and buffer the
                    // results, tracking UTF-8 byte offsets relative to the
                    // token's position in the original text.
                    let mut token_to = token.from;
                    match token.word.unwrap_alphabetic() {
                        Cow::Borrowed(word) => {
                            self.tokens = JIEBA
                                .cut(word, false)
                                .into_iter()
                                .map(|word| {
                                    let token_from = token_to;
                                    token_to += word.len();
                                    Token {
                                        word: I::new_alphabetic(word),
                                        from: token_from,
                                        to: token_to,
                                    }
                                })
                                .collect::<Vec<_>>()
                                .into_iter();
                        }
                        Cow::Owned(word) => {
                            self.tokens = JIEBA
                                .cut(&word, false)
                                .into_iter()
                                .map(|word| {
                                    let token_from = token_to;
                                    token_to += word.len();
                                    Token {
                                        word: I::new_alphabetic(word.to_string()),
                                        from: token_from,
                                        to: token_to,
                                    }
                                })
                                .collect::<Vec<_>>()
                                .into_iter();
                        }
                    }
                } else {
                    // Not a segmentation candidate: pass it through unchanged.
                    return token.into();
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};

    #[test]
    fn chinese_tokenizer() {
        // Offsets are UTF-8 byte positions; the fullwidth punctuation marks
        // (three bytes each) are skipped by WordTokenizer, which is why gaps
        // appear between consecutive tokens.
        assert_eq!(
            ChineseTokenizer::new(WordTokenizer::new(
                "孫子曰：兵者，國之大事，死生之地，存亡之道，不可不察也。",
                40
            ))
            .collect::<Vec<_>>(),
            vec![
                Token { word: "孫".into(), from: 0, to: 3 },
                Token { word: "子".into(), from: 3, to: 6 },
                Token { word: "曰".into(), from: 6, to: 9 },
                Token { word: "兵".into(), from: 12, to: 15 },
                Token { word: "者".into(), from: 15, to: 18 },
                Token { word: "國".into(), from: 21, to: 24 },
                Token { word: "之".into(), from: 24, to: 27 },
                Token { word: "大事".into(), from: 27, to: 33 },
                Token { word: "死".into(), from: 36, to: 39 },
                Token { word: "生".into(), from: 39, to: 42 },
                Token { word: "之".into(), from: 42, to: 45 },
                Token { word: "地".into(), from: 45, to: 48 },
                Token { word: "存亡".into(), from: 51, to: 57 },
                Token { word: "之".into(), from: 57, to: 60 },
                Token { word: "道".into(), from: 60, to: 63 },
                Token { word: "不可不".into(), from: 66, to: 75 },
                Token { word: "察".into(), from: 75, to: 78 },
                Token { word: "也".into(), from: 78, to: 81 },
            ]
        );
    }
}
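
// A minimal illustrative sketch, not part of the original file: if
// `is_alphabetic_8bit()` is false for pure-ASCII words (as its name suggests),
// ASCII tokens should pass through ChineseTokenizer unchanged, with their
// original byte offsets intact. The test is marked `#[ignore]` because that
// assumption about `is_alphabetic_8bit()` is not verified here.
#[cfg(test)]
mod passthrough_sketch {
    use crate::tokenizers::{chinese::ChineseTokenizer, word::WordTokenizer, Token};

    #[test]
    #[ignore = "illustrative sketch; assumes ASCII tokens bypass jieba"]
    fn ascii_tokens_pass_through() {
        // "hello" occupies bytes 0..5 and "world" bytes 6..11; the space at
        // byte 5 is skipped by WordTokenizer.
        assert_eq!(
            ChineseTokenizer::new(WordTokenizer::new("hello world", 40)).collect::<Vec<_>>(),
            vec![
                Token { word: "hello".into(), from: 0, to: 5 },
                Token { word: "world".into(), from: 6, to: 11 },
            ]
        );
    }
}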