mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-12 12:28:42 +00:00
### Benchmark

`yarn workspace @affine/server-native bench`

```
┌─────────┬────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name  │ ops/sec │ Average Time (ns)  │ Margin   │ Samples │
├─────────┼────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0       │ 'tiktoken' │ '5'     │ 176932518.76000002 │ '±4.71%' │ 100     │
│ 1       │ 'native'   │ '16'    │ 61041597.51000003  │ '±0.60%' │ 100     │
└─────────┴────────────┴─────────┴────────────────────┴──────────┴─────────┘
```
31 lines
669 B
Rust
31 lines
669 B
Rust
use std::collections::HashSet;
|
|
|
|
#[napi]
|
|
pub struct Tokenizer {
|
|
inner: tiktoken_rs::CoreBPE,
|
|
}
|
|
|
|
#[napi]
|
|
pub fn from_model_name(model_name: String) -> Option<Tokenizer> {
|
|
let bpe = tiktoken_rs::get_bpe_from_model(&model_name).ok()?;
|
|
Some(Tokenizer { inner: bpe })
|
|
}
|
|
|
|
#[napi]
|
|
impl Tokenizer {
|
|
#[napi]
|
|
pub fn count(&self, content: String, allowed_special: Option<Vec<String>>) -> u32 {
|
|
self
|
|
.inner
|
|
.encode(
|
|
&content,
|
|
if let Some(allowed_special) = &allowed_special {
|
|
HashSet::from_iter(allowed_special.iter().map(|s| s.as_str()))
|
|
} else {
|
|
Default::default()
|
|
},
|
|
)
|
|
.len() as u32
|
|
}
|
|
}
|