feat(server): use native tokenizer impl (#6960)

### Benchmark

`yarn workspace @affine/server-native bench`

```
┌─────────┬────────────┬─────────┬────────────────────┬──────────┬─────────┐
│ (index) │ Task Name  │ ops/sec │ Average Time (ns)  │ Margin   │ Samples │
├─────────┼────────────┼─────────┼────────────────────┼──────────┼─────────┤
│ 0       │ 'tiktoken' │ '5'     │ 176932518.76000002 │ '±4.71%' │ 100     │
│ 1       │ 'native'   │ '16'    │ 61041597.51000003  │ '±0.60%' │ 100     │
└─────────┴────────────┴─────────┴────────────────────┴──────────┴─────────┘
```
This commit is contained in:
Brooooooklyn
2024-05-16 07:55:10 +00:00
parent 46140039d9
commit c7ddd679fd
17 changed files with 206 additions and 39 deletions

78
Cargo.lock generated
View File

@@ -50,11 +50,13 @@ version = "1.0.0"
dependencies = [
"chrono",
"file-format",
"mimalloc",
"napi",
"napi-build",
"napi-derive",
"rand",
"sha3",
"tiktoken-rs",
"tokio",
"y-octo",
]
@@ -159,6 +161,21 @@ version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "bit-set"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
dependencies = [
"bit-vec",
]
[[package]]
name = "bit-vec"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -195,6 +212,17 @@ dependencies = [
"generic-array",
]
[[package]]
name = "bstr"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706"
dependencies = [
"memchr",
"regex-automata 0.4.6",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.16.0"
@@ -429,6 +457,16 @@ version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "fancy-regex"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05"
dependencies = [
"bit-set",
"regex",
]
[[package]]
name = "fastrand"
version = "2.0.2"
@@ -839,6 +877,16 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "libmimalloc-sys"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81eb4061c0582dedea1cbc7aff2240300dd6982e0239d1c99e65c1dbf4a30ba7"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "libsqlite3-sys"
version = "0.27.0"
@@ -912,6 +960,15 @@ version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
[[package]]
name = "mimalloc"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f41a2280ded0da56c8cf898babb86e8f10651a34adcfff190ae9a1159c6908d"
dependencies = [
"libmimalloc-sys",
]
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@@ -1385,6 +1442,12 @@ version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
version = "0.38.32"
@@ -1926,6 +1989,21 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tiktoken-rs"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234"
dependencies = [
"anyhow",
"base64",
"bstr",
"fancy-regex",
"lazy_static",
"parking_lot",
"rustc-hash",
]
[[package]]
name = "tinyvec"
version = "1.6.0"