// ctranslate2/tokenizer/rust_tokenizers.rs

1use std::path::Path;
2
3use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer as _};
4
/// Tokenizer backed by a SentencePiece model, wrapping
/// `rust_tokenizers`' `SentencePieceTokenizer`.
pub struct SentenceTokenizer {
    // The loaded SentencePiece model; all encode/decode calls delegate to it.
    spp: SentencePieceTokenizer,
}
8
9impl SentenceTokenizer {
10    pub fn new<P: AsRef<Path>>(path: P) -> Self {
11        let spp = SentencePieceTokenizer::from_file(path, false).unwrap();
12        Self { spp }
13    }
14}
15
16impl crate::Tokenizer for SentenceTokenizer {
17    fn encode(&self, input: &str) -> anyhow::Result<Vec<String>> {
18        let mut tokens = self.spp.tokenize(input);
19        tokens.push("</s>".to_owned());
20        Ok(tokens)
21    }
22
23    fn decode(&self, tokens: Vec<String>) -> anyhow::Result<String> {
24        Ok(self.spp.convert_tokens_to_string(tokens).trim().to_owned())
25    }
26}