feat(server): use native tokenizer impl (#6960)

### Benchmark

`yarn workspace @affine/server-native bench`

```
┌─────────┬────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name  │ ops/sec │ Average Time (ns)  │ Margin   │ Samples │
├─────────┼────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0       │ 'tiktoken' │ '5'     │ 176932518.76000002 │ '±4.71%' │ 100     │
│ 1       │ 'native'   │ '16'    │ 61041597.51000003  │ '±0.60%' │ 100     │
└─────────┴────────────┴─────────┴────────────────────┴──────────┴─────────┘
```
This commit is contained in:
Brooooooklyn
2024-05-16 07:55:10 +00:00
parent 46140039d9
commit c7ddd679fd
17 changed files with 206 additions and 39 deletions

View File

@@ -2,12 +2,17 @@
pub mod file_type;
pub mod hashcash;
pub mod tiktoken;
use std::fmt::{Debug, Display};
use napi::{bindgen_prelude::*, Error, Result, Status};
use y_octo::Doc;
#[cfg(not(target_arch = "arm"))]
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
#[macro_use]
extern crate napi_derive;

View File

@@ -0,0 +1,30 @@
use std::collections::HashSet;
#[napi]
pub struct Tokenizer {
inner: tiktoken_rs::CoreBPE,
}
#[napi]
pub fn from_model_name(model_name: String) -> Option<Tokenizer> {
let bpe = tiktoken_rs::get_bpe_from_model(&model_name).ok()?;
Some(Tokenizer { inner: bpe })
}
#[napi]
impl Tokenizer {
#[napi]
pub fn count(&self, content: String, allowed_special: Option<Vec<String>>) -> u32 {
self
.inner
.encode(
&content,
if let Some(allowed_special) = &allowed_special {
HashSet::from_iter(allowed_special.iter().map(|s| s.as_str()))
} else {
Default::default()
},
)
.len() as u32
}
}