// ctranslate2/tokenizer/rust_tokenizers.rs
use std::path::Path;

use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer as _};

/// Tokenizer backed by a SentencePiece model file, wrapping
/// `rust_tokenizers::SentencePieceTokenizer`.
pub struct SentenceTokenizer {
    /// The underlying SentencePiece model, loaded from disk in `new`.
    spp: SentencePieceTokenizer,
}
9impl SentenceTokenizer {
10 pub fn new<P: AsRef<Path>>(path: P) -> Self {
11 let spp = SentencePieceTokenizer::from_file(path, false).unwrap();
12 Self { spp }
13 }
14}
16impl crate::Tokenizer for SentenceTokenizer {
17 fn encode(&self, input: &str) -> anyhow::Result<Vec<String>> {
18 let mut tokens = self.spp.tokenize(input);
19 tokens.push("</s>".to_owned());
20 Ok(tokens)
21 }
22
23 fn decode(&self, tokens: Vec<String>) -> anyhow::Result<String> {
24 Ok(self.spp.convert_tokens_to_string(tokens).trim().to_owned())
25 }
26}