Files
AFFiNE-Mirror/packages/backend/native/src/tiktoken.rs
Brooooooklyn c7ddd679fd feat(server): use native tokenizer impl (#6960)
### Benchmark

`yarn workspace @affine/server-native bench`

```
┌─────────┬────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name  │ ops/sec │ Average Time (ns)  │ Margin   │ Samples │
├─────────┼────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0       │ 'tiktoken' │ '5'     │ 176932518.76000002 │ '±4.71%' │ 100     │
│ 1       │ 'native'   │ '16'    │ 61041597.51000003  │ '±0.60%' │ 100     │
└─────────┴────────────┴─────────┴────────────────────┴──────────┴─────────┘
```
2024-05-16 07:55:10 +00:00

31 lines
669 B
Rust

use std::collections::HashSet;
#[napi]
pub struct Tokenizer {
inner: tiktoken_rs::CoreBPE,
}
#[napi]
pub fn from_model_name(model_name: String) -> Option<Tokenizer> {
let bpe = tiktoken_rs::get_bpe_from_model(&model_name).ok()?;
Some(Tokenizer { inner: bpe })
}
#[napi]
impl Tokenizer {
#[napi]
pub fn count(&self, content: String, allowed_special: Option<Vec<String>>) -> u32 {
self
.inner
.encode(
&content,
if let Some(allowed_special) = &allowed_special {
HashSet::from_iter(allowed_special.iter().map(|s| s.as_str()))
} else {
Default::default()
},
)
.len() as u32
}
}