From e8693a3a2511ae01b44ced59f5fba7c238d0100c Mon Sep 17 00:00:00 2001
From: DarkSky <25152247+darkskygit@users.noreply.github.com>
Date: Thu, 25 Dec 2025 04:40:23 +0800
Subject: [PATCH] feat: introduce fuzzy search for native indexer (#14109)

---
 Cargo.lock                                    |  41 ++-
 Cargo.toml                                    |   1 +
 packages/common/native/Cargo.toml             |  18 +-
 .../common/nbstore/src/impls/sqlite/db.ts     |   2 +-
 .../common/y-octo/core/src/doc/codec/refs.rs  |  12 +
 .../common/y-octo/core/src/doc/types/array.rs |  46 ++-
 packages/common/y-octo/node/src/utils.rs      |   1 -
 .../y-octo/utils/bin/bench_result_render.rs   |   6 +-
 .../common/y-octo/utils/bin/doc_merger.rs     |   4 +-
 .../frontend/admin/src/modules/setup/form.tsx |   2 +-
 .../src/plugins/nbstore/definitions.ts        |   2 +-
 .../apps/android/src/plugins/nbstore/index.ts |   2 +-
 .../ios/src/plugins/nbstore/definitions.ts    |   2 +-
 .../apps/ios/src/plugins/nbstore/index.ts     |   2 +-
 packages/frontend/native/index.d.ts           |   1 +
 .../src/macos/audio_stream_basic_desc.rs      |   2 +-
 .../media_capture/src/macos/tap_audio.rs      |  65 ++---
 .../native/media_capture/src/macos/utils.rs   |   2 +-
 packages/frontend/native/nbstore/Cargo.toml   |   4 +-
 .../src/{indexer/mod.rs => indexer.rs}        | 131 +++++++--
 .../nbstore/src/indexer/memory_indexer.rs     | 261 ------------------
 .../native/nbstore/src/indexer/tokenizer.rs   |  85 ------
 .../native/nbstore/src/indexer/types.rs       |  79 ------
 .../frontend/native/nbstore/src/storage.rs    |   3 +-
 24 files changed, 237 insertions(+), 537 deletions(-)
 rename packages/frontend/native/nbstore/src/{indexer/mod.rs => indexer.rs} (70%)
 delete mode 100644 packages/frontend/native/nbstore/src/indexer/memory_indexer.rs
 delete mode 100644 packages/frontend/native/nbstore/src/indexer/tokenizer.rs
 delete mode 100644 packages/frontend/native/nbstore/src/indexer/types.rs

diff --git a/Cargo.lock b/Cargo.lock
index bf41427865..3340b75c7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -49,7 +49,7 @@ dependencies = [
  "assert-json-diff",
  "cc",
  "chrono",
- "criterion2",
+ "criterion",
  "docx-parser",
  "infer",
  "path-ext",
  "pdf-extract",
@@ -61,6 +61,7 @@ dependencies = [
  "serde_json",
  "sha3",
  "strum_macros",
+ "tempfile",
  "text-splitter",
  "thiserror 2.0.12",
  "tiktoken-rs",
@@ -153,16 +154,14 @@ dependencies = [
  "bincode",
  "chrono",
  "dotenvy",
- "jieba-rs",
+ "memory-indexer",
  "napi",
  "napi-build",
  "napi-derive",
- "once_cell",
  "serde",
  "serde_json",
  "sqlx",
  "thiserror 2.0.12",
- "tiniestsegmenter",
  "tokio",
  "uniffi",
  "uuid",
@@ -2642,6 +2641,22 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"

+[[package]]
+name = "memory-indexer"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10feba381e2eeb6582f34379d62ee0658e57f63d776698150b985bd0f38664b3"
+dependencies = [
+ "jieba-rs",
+ "once_cell",
+ "pinyin",
+ "serde",
+ "serde_json",
+ "strsim",
+ "unicode-normalization",
+ "unicode-script",
+]
+
 [[package]]
 name = "mimalloc"
 version = "0.1.46"
@@ -3284,6 +3299,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

+[[package]]
+name = "pinyin"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16f2611cd06a1ac239a0cea4521de9eb068a6ca110324ee00631aa68daa74fc0"
+
 [[package]]
 name = "pkcs1"
 version = "0.7.5"
@@ -4772,12 +4793,6 @@ dependencies = [
  "rustc-hash 1.1.0",
 ]

-[[package]]
-name = "tiniestsegmenter"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f39721481fb54f0e9f3a1da5a6ac6063c61ec62ec828cd5e1860acce9458f40" - [[package]] name = "tinystr" version = "0.8.1" @@ -5141,6 +5156,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" +[[package]] +name = "unicode-script" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "383ad40bb927465ec0ce7720e033cb4ca06912855fc35db31b5755d0de75b1ee" + [[package]] name = "unicode-segmentation" version = "1.12.0" diff --git a/Cargo.toml b/Cargo.toml index 045551c809..2da3d23c86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ resolver = "3" libc = "0.2" log = "0.4" loom = { version = "0.7", features = ["checkpoint"] } + memory-indexer = "0.2" mimalloc = "0.1" mp4parse = "0.17" nanoid = "0.4" diff --git a/packages/common/native/Cargo.toml b/packages/common/native/Cargo.toml index 391827bfbd..86342d5953 100644 --- a/packages/common/native/Cargo.toml +++ b/packages/common/native/Cargo.toml @@ -17,10 +17,11 @@ doc-loader = [ "strum_macros", "text-splitter", "thiserror", + "tiktoken-rs", "tree-sitter", "url", ] -hashcash = ["sha3", "rand"] +hashcash = ["chrono", "sha3", "rand"] tree-sitter = [ "cc", "dep:tree-sitter", @@ -39,24 +40,24 @@ tree-sitter = [ ydoc-loader = ["assert-json-diff", "serde", "serde_json", "thiserror", "y-octo"] [dependencies] -chrono = { workspace = true } -rand = { workspace = true, optional = true } -sha3 = { workspace = true, optional = true } - assert-json-diff = { workspace = true, optional = true } +chrono = { workspace = true, optional = true } docx-parser = { workspace = true, optional = true } infer = { workspace = true, optional = true } path-ext = { workspace = true, optional = true } pdf-extract = { workspace = true, optional = true } +rand = { workspace = true, optional = true } readability = { workspace = true, optional = true, default-features = false } serde = { workspace = true, optional = true, features = ["derive"] } serde_json = { workspace = true, optional = true } +sha3 = { workspace = true, optional = true } strum_macros = { workspace = true, optional = true } text-splitter = { workspace = true, features = [ "markdown", "tiktoken-rs", ], optional = true } thiserror = { workspace = true, optional = true } +tiktoken-rs = { workspace = true, optional = true } tree-sitter = { workspace = true, optional = true } tree-sitter-c = { workspace = true, optional = true } tree-sitter-c-sharp = { workspace = true, optional = true } @@ -72,11 +73,10 @@ tree-sitter-typescript = { workspace = true, optional = true } url = { workspace = true, optional = true } y-octo = { workspace = true, optional = true } -tiktoken-rs = { workspace = true } - [dev-dependencies] -criterion2 = { workspace = true } -rayon = { workspace = true } +criterion = { workspace = true } +rayon = { workspace = true } +tempfile = "3" [build-dependencies] cc = { version = "1", optional = true } diff --git a/packages/common/nbstore/src/impls/sqlite/db.ts b/packages/common/nbstore/src/impls/sqlite/db.ts index 6138ca6d51..cb22323c4a 100644 --- a/packages/common/nbstore/src/impls/sqlite/db.ts +++ b/packages/common/nbstore/src/impls/sqlite/db.ts @@ -99,7 +99,7 @@ export interface NativeDBApis { id: string, indexName: string, query: string - ) => Promise<{ id: string; score: number }[]>; + ) => Promise<{ id: string; score: number; terms: Array }[]>; ftsGetDocument: ( id: string, 
    indexName: string,
diff --git a/packages/common/y-octo/core/src/doc/codec/refs.rs b/packages/common/y-octo/core/src/doc/codec/refs.rs
index 4f65e4ac38..c9196fada8 100644
--- a/packages/common/y-octo/core/src/doc/codec/refs.rs
+++ b/packages/common/y-octo/core/src/doc/codec/refs.rs
@@ -254,6 +254,11 @@ impl Node {
     let mut ritem = unsafe { rref.get_mut_unchecked() };
     let llen = litem.len();

+    let parent_kind = match &litem.parent {
+      Some(Parent::Type(ty)) => ty.ty().map(|ty| ty.kind()),
+      _ => None,
+    };
+
     if litem.id.client != ritem.id.client
       // not same delete status
       || litem.deleted() != ritem.deleted()
@@ -277,6 +282,13 @@ impl Node {
             l.extend(r.drain(0..));
           }
           (Content::String(l), Content::String(r)) => {
+            let allow_merge_string =
+              matches!(parent_kind, Some(YTypeKind::Text | YTypeKind::XMLText));
+
+            if !allow_merge_string {
+              return false;
+            }
+
             *l += r;
           }
           (Content::Any(l), Content::Any(r)) => {
diff --git a/packages/common/y-octo/core/src/doc/types/array.rs b/packages/common/y-octo/core/src/doc/types/array.rs
index 8729f26c17..e11f8f730d 100644
--- a/packages/common/y-octo/core/src/doc/types/array.rs
+++ b/packages/common/y-octo/core/src/doc/types/array.rs
@@ -4,21 +4,50 @@ impl_type!(Array);

 impl ListType for Array {}

-pub struct ArrayIter<'a>(ListIterator<'a>);
+pub struct ArrayIter<'a> {
+  iter: ListIterator<'a>,
+  pending: Option<PendingArrayValues>,
+}
+
+enum PendingArrayValues {
+  Any { values: Vec<Any>, index: usize },
+}

 impl Iterator for ArrayIter<'_> {
   type Item = Value;

   fn next(&mut self) -> Option<Self::Item> {
-    for item in self.0.by_ref() {
+    loop {
+      if let Some(PendingArrayValues::Any { values, index }) = &mut self.pending {
+        if *index < values.len() {
+          let value = values[*index].clone();
+          *index += 1;
+          return Some(Value::Any(value));
+        }
+        self.pending = None;
+      }
+
+      let item = self.iter.next()?;
       if let Some(item) = item.get() {
-        if item.countable() {
-          return Some(Value::from(&item.content));
+        if !item.countable() {
+          continue;
+        }
+
+        match &item.content {
+          Content::Any(values) if !values.is_empty() => {
+            if values.len() > 1 {
+              self.pending = Some(PendingArrayValues::Any {
+                values: values.clone(),
+                index: 1,
+              });
+            }
+
+            return Some(Value::Any(values[0].clone()));
+          }
+          _ => return Some(Value::from(&item.content)),
         }
       }
     }
-
-    None
   }
 }

@@ -46,7 +75,10 @@ impl Array {
   }

   pub fn iter(&self) -> ArrayIter<'_> {
-    ArrayIter(self.iter_item())
+    ArrayIter {
+      iter: self.iter_item(),
+      pending: None,
+    }
   }

   pub fn push<V: Into<Value>>(&mut self, val: V) -> JwstCodecResult {
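A note on the ArrayIter rewrite above: a single squashed `Content::Any` item can carry several primitive values, yet `Array::iter()` must still yield one `Value` per logical element, so the iterator hands out the first value immediately and parks the rest in `pending`, draining it on subsequent `next()` calls. The same pattern in isolation, as a self-contained sketch (plain `Vec<i32>` stands in for a multi-value item; names are illustrative, not y-octo API):

struct PendingIter<I: Iterator<Item = Vec<i32>>> {
  iter: I,
  // Values already pulled from one multi-value item, plus the next index.
  pending: Option<(Vec<i32>, usize)>,
}

impl<I: Iterator<Item = Vec<i32>>> Iterator for PendingIter<I> {
  type Item = i32;

  fn next(&mut self) -> Option<i32> {
    loop {
      // Drain the buffered item first, one element per call.
      if let Some((values, index)) = &mut self.pending {
        if *index < values.len() {
          let v = values[*index];
          *index += 1;
          return Some(v);
        }
        self.pending = None;
      }
      // Then pull the next item; buffer it only when it carries extra values.
      let values = self.iter.next()?;
      match values.len() {
        0 => continue, // mirrors the `!item.countable()` skip
        1 => return Some(values[0]),
        _ => {
          let first = values[0];
          self.pending = Some((values, 1));
          return Some(first);
        }
      }
    }
  }
}

fn main() {
  let it = PendingIter {
    iter: vec![vec![1, 2], vec![], vec![3]].into_iter(),
    pending: None,
  };
  assert_eq!(it.collect::<Vec<_>>(), vec![1, 2, 3]);
}

Keeping the buffer inside the iterator keeps `next()` O(1) per element, and the single-value fast path avoids any cloning, matching the shape of the diff.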
diff --git a/packages/common/y-octo/node/src/utils.rs b/packages/common/y-octo/node/src/utils.rs
index 2ab9c2cd2a..85d8419d2d 100644
--- a/packages/common/y-octo/node/src/utils.rs
+++ b/packages/common/y-octo/node/src/utils.rs
@@ -78,7 +78,6 @@ pub fn get_any_from_js_object(object: Object) -> Result<Any> {
     })
   }) {
     if let Ok(value) = object.get_named_property_unchecked::<Unknown>(&key) {
-      println!("key: {}", key);
       map.insert(key, get_any_from_js_unknown(value)?);
     }
   }
diff --git a/packages/common/y-octo/utils/bin/bench_result_render.rs b/packages/common/y-octo/utils/bin/bench_result_render.rs
index ecb54a743b..054335c05f 100644
--- a/packages/common/y-octo/utils/bin/bench_result_render.rs
+++ b/packages/common/y-octo/utils/bin/bench_result_render.rs
@@ -85,7 +85,7 @@ fn convert_to_markdown() -> impl Iterator<Item = String> {
       let (changes_dur_secs, changes_err_secs) = process_duration(&changes_duration)?;

       let diff = -(1.0 - changes_dur_secs / base_dur_secs) * 100.0;
-      difference = format!("{:+.2}%", diff);
+      difference = format!("{diff:+.2}%");

       if is_significant(
         changes_dur_secs,
@@ -93,7 +93,7 @@
         base_dur_secs,
         base_err_secs,
       ) {
-        difference = format!("**{}**", difference);
+        difference = format!("**{difference}**");
       }
     }

@@ -128,7 +128,7 @@ fn main() {
   ];

   for line in headers.into_iter().chain(convert_to_markdown()) {
-    println!("{}", line);
+    println!("{line}");
   }
   println!("");
 }
diff --git a/packages/common/y-octo/utils/bin/doc_merger.rs b/packages/common/y-octo/utils/bin/doc_merger.rs
index c43c320a6a..552ce83445 100644
--- a/packages/common/y-octo/utils/bin/doc_merger.rs
+++ b/packages/common/y-octo/utils/bin/doc_merger.rs
@@ -35,7 +35,7 @@ fn load_path(path: &str) -> Result<Vec<Vec<u8>>, Error> {
     paths.sort();

     for path in paths {
-      println!("read {:?}", path);
+      println!("read {path:?}");
       updates.push(read(path)?);
     }
     Ok(updates)
@@ -66,7 +66,7 @@ fn jwst_merge(path: &str) {
   let history = doc.history().parse_store(Default::default());
   println!("history: {:?}", ts.elapsed());
   for history in history.iter().take(100) {
-    println!("history: {:?}", history);
+    println!("history: {history:?}");
   }

   doc.gc().unwrap();
diff --git a/packages/frontend/admin/src/modules/setup/form.tsx b/packages/frontend/admin/src/modules/setup/form.tsx
index 206fb706de..e500457b42 100644
--- a/packages/frontend/admin/src/modules/setup/form.tsx
+++ b/packages/frontend/admin/src/modules/setup/form.tsx
@@ -121,7 +121,7 @@ export const Form = () => {
       console.error(err);
       throw err;
     }
-  }, [emailValue, passwordValue, refreshServerConfig]);
+  }, [nameValue, emailValue, passwordValue, refreshServerConfig]);

   const onNext = useAsyncCallback(async () => {
     if (isCreateAdminStep) {
diff --git a/packages/frontend/apps/android/src/plugins/nbstore/definitions.ts b/packages/frontend/apps/android/src/plugins/nbstore/definitions.ts
index 0db7082376..a0687168b6 100644
--- a/packages/frontend/apps/android/src/plugins/nbstore/definitions.ts
+++ b/packages/frontend/apps/android/src/plugins/nbstore/definitions.ts
@@ -171,7 +171,7 @@ export interface NbStorePlugin {
     id: string;
     indexName: string;
     query: string;
-  }) => Promise<{ id: string; score: number }[]>;
+  }) => Promise<{ id: string; score: number; terms: Array<string> }[]>;
   ftsGetDocument: (options: {
     id: string;
     indexName: string;
diff --git a/packages/frontend/apps/android/src/plugins/nbstore/index.ts b/packages/frontend/apps/android/src/plugins/nbstore/index.ts
index 881162c258..528f644942 100644
--- a/packages/frontend/apps/android/src/plugins/nbstore/index.ts
+++ b/packages/frontend/apps/android/src/plugins/nbstore/index.ts
@@ -369,7 +369,7 @@ export const NbStoreNativeDBApis: NativeDBApis = {
     id: string,
     indexName: string,
     query: string
-  ): Promise<{ id: string; score: number }[]> {
+  ): Promise<{ id: string; score: number; terms: Array<string> }[]> {
     return await NbStore.ftsSearch({
       id,
       indexName,
diff --git a/packages/frontend/apps/ios/src/plugins/nbstore/definitions.ts b/packages/frontend/apps/ios/src/plugins/nbstore/definitions.ts
index 0db7082376..a0687168b6 100644
--- a/packages/frontend/apps/ios/src/plugins/nbstore/definitions.ts
+++ b/packages/frontend/apps/ios/src/plugins/nbstore/definitions.ts
@@ -171,7 +171,7 @@ export interface NbStorePlugin {
     id: string;
     indexName: string;
     query: string;
-  }) => Promise<{ id: string; score: number }[]>;
+  }) => Promise<{ id: string; score: number; terms: Array<string> }[]>;
   ftsGetDocument: (options: {
     id: string;
     indexName: string;
diff --git a/packages/frontend/apps/ios/src/plugins/nbstore/index.ts b/packages/frontend/apps/ios/src/plugins/nbstore/index.ts
index 766c2cabea..da438efde0 100644
--- a/packages/frontend/apps/ios/src/plugins/nbstore/index.ts
+++ b/packages/frontend/apps/ios/src/plugins/nbstore/index.ts
@@ -373,7 +373,7 @@ export const NbStoreNativeDBApis: NativeDBApis = {
     id: string,
     indexName: string,
     query: string
-  ): Promise<{ id: string; score: number }[]> {
+  ): Promise<{ id: string; score: number; terms: Array<string> }[]> {
     return await NbStore.ftsSearch({
       id,
       indexName,
diff --git a/packages/frontend/native/index.d.ts b/packages/frontend/native/index.d.ts
index 18cc0ec8d8..7ff863b118 100644
--- a/packages/frontend/native/index.d.ts
+++ b/packages/frontend/native/index.d.ts
@@ -148,6 +148,7 @@ export interface NativeMatch {
 export interface NativeSearchHit {
   id: string
   score: number
+  terms: Array<string>
 }

 export interface SetBlob {
diff --git a/packages/frontend/native/media_capture/src/macos/audio_stream_basic_desc.rs b/packages/frontend/native/media_capture/src/macos/audio_stream_basic_desc.rs
index 314dfeec61..1fab8c626f 100644
--- a/packages/frontend/native/media_capture/src/macos/audio_stream_basic_desc.rs
+++ b/packages/frontend/native/media_capture/src/macos/audio_stream_basic_desc.rs
@@ -171,7 +171,7 @@ impl std::fmt::Display for AudioFormatFlags {

 impl std::fmt::Debug for AudioFormatFlags {
   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-    write!(f, "AudioFormatFlags({})", self)
+    write!(f, "AudioFormatFlags({self})")
   }
 }

diff --git a/packages/frontend/native/media_capture/src/macos/tap_audio.rs b/packages/frontend/native/media_capture/src/macos/tap_audio.rs
index b01f67e17f..06e9ae1258 100644
--- a/packages/frontend/native/media_capture/src/macos/tap_audio.rs
+++ b/packages/frontend/native/media_capture/src/macos/tap_audio.rs
@@ -354,7 +354,7 @@ impl AggregateDevice {
     input_device_id: CFString,
     output_device_id: CFString,
   ) -> Result> {
-    let aggregate_device_name = CFString::new(&format!("Tap-{}", tap_id));
+    let aggregate_device_name = CFString::new(&format!("Tap-{tap_id}"));

     let aggregate_device_uid: uuid::Uuid = CFUUID::new().into();
     let aggregate_device_uid_string = aggregate_device_uid.to_string();
@@ -469,18 +469,12 @@ impl AudioTapStream {
       // Ignore errors as device might be disconnected
       let status = unsafe { AudioDeviceStop(self.input_device_id, proc_id) };
       if status != 0 {
-        println!(
-          "DEBUG: WARNING: Input device stop failed with status: {}",
-          status
-        );
+        println!("DEBUG: WARNING: Input device stop failed with status: {status}");
       }

       let status = unsafe { AudioDeviceDestroyIOProcID(self.input_device_id, proc_id) };
       if status != 0 {
-        println!(
-          "DEBUG: WARNING: Input device destroy IO proc failed with status: {}",
-          status
-        );
+        println!("DEBUG: WARNING: Input device destroy IO proc failed with status: {status}");
       }
     }

@@ -489,18 +483,12 @@
       // Ignore errors as device might be disconnected
       let status = unsafe { AudioDeviceStop(self.output_device_id, proc_id) };
       if status != 0 {
-        println!(
-          "DEBUG: WARNING: Output device stop failed with status: {}",
-          status
-        );
+        println!("DEBUG: WARNING: Output device stop failed with status: {status}");
       }

       let status = unsafe { AudioDeviceDestroyIOProcID(self.output_device_id, proc_id) };
       if status != 0 {
-        println!(
-          "DEBUG: WARNING: Output device destroy IO proc failed with status: {}",
-          status
-        );
+        println!("DEBUG: WARNING: Output device destroy IO proc failed with status: {status}");
       }
     }

@@ -508,27 +496,18 @@
     if device_exists {
       let status = unsafe { AudioDeviceDestroyIOProcID(self.device_id, self.in_proc_id) };
      if status != 0 {
-        println!(
-          "DEBUG: WARNING: Destroy IO proc failed with status: {}",
-          status
-        );
+        println!("DEBUG: WARNING: Destroy IO proc failed with status: {status}");
       }
     }

     let status = unsafe { AudioHardwareDestroyAggregateDevice(self.device_id) };
     if status != 0 {
-      println!(
-        "DEBUG: WARNING: AudioHardwareDestroyAggregateDevice failed with status: {}",
-        status
-      );
+      println!("DEBUG: WARNING: AudioHardwareDestroyAggregateDevice failed with status: {status}");
     }

     // Destroy the process tap - don't fail if this fails
     let status = unsafe { AudioHardwareDestroyProcessTap(self.device_id) };
     if status != 0 {
-      println!(
-        "DEBUG: WARNING: AudioHardwareDestroyProcessTap failed with status: {}",
-        status
-      );
+      println!("DEBUG: WARNING: AudioHardwareDestroyProcessTap failed with status: {status}");
     }

     // destroy the queue
@@ -743,10 +722,7 @@ impl AggregateDeviceManager {
           let stop_result = old_stream.stop();
           match stop_result {
             Ok(_) => {}
-            Err(e) => println!(
-              "DEBUG: Error stopping old stream (proceeding anyway): {}",
-              e
-            ),
+            Err(e) => println!("DEBUG: Error stopping old stream (proceeding anyway): {e}"),
           };
           drop(old_stream); // Ensure it's dropped now
         }
@@ -757,12 +733,12 @@ impl AggregateDeviceManager {
                 *stream_guard = Some(new_stream);
               }
               Err(e) => {
-                println!("DEBUG: Failed to start new stream: {}", e);
+                println!("DEBUG: Failed to start new stream: {e}");
               }
             }
           }
           Err(e) => {
-            println!("DEBUG: Failed to create new device: {}", e);
+            println!("DEBUG: Failed to create new device: {e}");
           }
         }
       },
@@ -786,10 +762,7 @@ impl AggregateDeviceManager {
       );

       if status != 0 {
-        println!(
-          "DEBUG: Failed to register input device listener, status: {}",
-          status
-        );
+        println!("DEBUG: Failed to register input device listener, status: {status}");
         return Err(CoreAudioError::AddPropertyListenerBlockFailed(status).into());
       }

@@ -805,10 +778,7 @@ impl AggregateDeviceManager {
       );

       if status != 0 {
-        println!(
-          "DEBUG: Failed to register output device listener, status: {}",
-          status
-        );
+        println!("DEBUG: Failed to register output device listener, status: {status}");
         // Clean up the first listener if the second one fails
         AudioObjectRemovePropertyListenerBlock(
           kAudioObjectSystemObject,
@@ -907,10 +877,7 @@ impl AggregateDeviceManager {
     if let Some(mut stream) = stream_to_stop {
       match stream.stop() {
         Ok(_) => {}
-        Err(e) => println!(
-          "DEBUG: Error stopping stream in stop_capture (ignored): {}",
-          e
-        ),
+        Err(e) => println!("DEBUG: Error stopping stream in stop_capture (ignored): {e}"),
       }
       // Explicitly drop here after stopping
       drop(stream);
@@ -960,7 +927,7 @@ impl AggregateDeviceManager {
      match stream.get_actual_sample_rate() {
        Ok(rate) => Ok(Some(rate)),
        Err(e) => {
-         println!("DEBUG: Error getting actual sample rate from stream: {}", e);
+         println!("DEBUG: Error getting actual sample rate from stream: {e}");
          // Propagate the error
          Err(e)
        }
@@ -976,7 +943,7 @@ impl Drop for AggregateDeviceManager {
     // Call stop_capture which handles listener cleanup and stream stopping
     match self.stop_capture() {
       Ok(_) => {}
-      Err(e) => println!("DEBUG: Error during stop_capture in Drop (ignored): {}", e),
+      Err(e) => println!("DEBUG: Error during stop_capture in Drop (ignored): {e}"),
     }
   }
 }
diff --git a/packages/frontend/native/media_capture/src/macos/utils.rs b/packages/frontend/native/media_capture/src/macos/utils.rs
index c88a005fad..a638dac063 100644
--- a/packages/frontend/native/media_capture/src/macos/utils.rs
+++ b/packages/frontend/native/media_capture/src/macos/utils.rs
@@ -73,8 +73,8 @@ impl BufferedResampler {
     } else {
       // interleave
       let out_len = out_blocks[0].len();
+      #[allow(clippy::needless_range_loop)]
       for i in 0..out_len {
-        #[allow(clippy::needless_range_loop)]
         // apply clippy lint suggestion would regress performance
         for ch in 0..self.channels {
           interleaved_out.push(out_blocks[ch][i]);
diff --git a/packages/frontend/native/nbstore/Cargo.toml b/packages/frontend/native/nbstore/Cargo.toml
index f8400a39ea..08bffcc69d 100644
--- a/packages/frontend/native/nbstore/Cargo.toml
+++ b/packages/frontend/native/nbstore/Cargo.toml
@@ -15,10 +15,9 @@ affine_schema = { path = "../schema" }
 anyhow = { workspace = true }
 bincode = { version = "2.0.1", features = ["serde"] }
 chrono = { workspace = true }
-jieba-rs = "0.8.1"
+memory-indexer = { workspace = true }
 napi = { workspace = true }
 napi-derive = { workspace = true }
-once_cell = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
 sqlx = { workspace = true, default-features = false, features = [
   "chrono",
@@ -29,7 +28,6 @@ sqlx = { workspace = true, default-features = false, features = [
   "tls-rustls",
 ] }
 thiserror = { workspace = true }
-tiniestsegmenter = "0.3"
 tokio = { workspace = true, features = ["full"] }
 y-octo = { workspace = true }
 zstd = "0.13"
diff --git a/packages/frontend/native/nbstore/src/indexer/mod.rs b/packages/frontend/native/nbstore/src/indexer.rs
similarity index 70%
rename from packages/frontend/native/nbstore/src/indexer/mod.rs
rename to packages/frontend/native/nbstore/src/indexer.rs
index 38da761112..0684850f22 100644
--- a/packages/frontend/native/nbstore/src/indexer/mod.rs
+++ b/packages/frontend/native/nbstore/src/indexer.rs
@@ -1,13 +1,8 @@
-mod memory_indexer;
-mod tokenizer;
-mod types;
-
-use affine_common::doc_parser::{parse_doc_from_binary, ParseError};
-pub use memory_indexer::InMemoryIndex;
+use affine_common::doc_parser::{parse_doc_from_binary, BlockInfo, CrawlResult, ParseError};
+use memory_indexer::{SearchHit, SnapshotData};
+use napi_derive::napi;
+use serde::Serialize;
 use sqlx::Row;
-pub use types::{
-  DocData, NativeBlockInfo, NativeCrawlResult, NativeMatch, NativeSearchHit, SnapshotData,
-};
 use y_octo::DocOptions;

 use super::{
@@ -15,6 +10,88 @@ use super::{
   storage::SqliteDocStorage,
 };

+#[napi(object)]
+#[derive(Debug, Serialize)]
+pub struct NativeBlockInfo {
+  pub block_id: String,
+  pub flavour: String,
+  pub content: Option<Vec<String>>,
+  pub blob: Option<Vec<String>>,
+  pub ref_doc_id: Option<Vec<String>>,
+  pub ref_info: Option<Vec<String>>,
+  pub parent_flavour: Option<String>,
+  pub parent_block_id: Option<String>,
+  pub additional: Option<String>,
+}
+
+impl From<BlockInfo> for NativeBlockInfo {
+  fn from(value: BlockInfo) -> Self {
+    Self {
+      block_id: value.block_id,
+      flavour: value.flavour,
+      content: value.content,
+      blob: value.blob,
+      ref_doc_id: value.ref_doc_id,
+      ref_info: value.ref_info,
+      parent_flavour: value.parent_flavour,
+      parent_block_id: value.parent_block_id,
+      additional: value.additional,
+    }
+  }
+}
+
+#[napi(object)]
+#[derive(Debug, Serialize)]
+pub struct NativeCrawlResult {
+  pub blocks: Vec<NativeBlockInfo>,
+  pub title: String,
+  pub summary: String,
+}
+
+impl From<CrawlResult> for NativeCrawlResult {
+  fn from(value: CrawlResult) -> Self {
+    Self {
+      blocks: value.blocks.into_iter().map(Into::into).collect(),
+      title: value.title,
+      summary: value.summary,
+    }
+  }
+}
+
+#[napi(object)]
+#[derive(Debug, Serialize)]
+pub struct NativeSearchHit {
+  pub id: String,
+  pub score: f64,
+  pub terms: Vec<String>,
+}
+
+impl From<SearchHit> for NativeSearchHit {
+  fn from(value: SearchHit) -> Self {
+    Self {
+      id: value.doc_id,
+      score: value.score,
+      terms: value.matched_terms.into_iter().map(|t| t.term).collect(),
+    }
+  }
+}
+
+#[napi(object)]
+#[derive(Debug, Serialize)]
+pub struct NativeMatch {
+  pub start: u32,
+  pub end: u32,
+}
+
+impl From<(u32, u32)> for NativeMatch {
+  fn from(value: (u32, u32)) -> Self {
+    Self {
+      start: value.0,
+      end: value.1,
+    }
+  }
+}
+
 impl SqliteDocStorage {
   pub async fn crawl_doc_data(&self, doc_id: &str) -> Result<NativeCrawlResult> {
     let doc_bin = self
@@ -53,14 +130,14 @@ impl SqliteDocStorage {
     {
       let mut index = self.index.write().await;
+      let config = bincode::config::standard();
       for row in snapshots {
         let index_name: String = row.get("index_name");
         let data: Vec<u8> = row.get("data");
         if let Ok(decompressed) = zstd::stream::decode_all(std::io::Cursor::new(&data)) {
-          if let Ok((snapshot, _)) = bincode::serde::decode_from_slice::<SnapshotData, _>(
-            &decompressed,
-            bincode::config::standard(),
-          ) {
+          if let Ok((snapshot, _)) =
+            bincode::serde::decode_from_slice::<SnapshotData, _>(&decompressed, config)
+          {
             index.load_snapshot(&index_name, snapshot);
           }
         }
@@ -79,7 +156,7 @@
     if let Some(data) = snapshot_data {
       let blob = bincode::serde::encode_to_vec(&data, bincode::config::standard())
         .map_err(|e| Error::Serialization(e.to_string()))?;
-      let compressed = zstd::stream::encode_all(std::io::Cursor::new(&blob), 0)
+      let compressed = zstd::stream::encode_all(std::io::Cursor::new(&blob), 4)
         .map_err(|e| Error::Serialization(e.to_string()))?;

       let mut tx = self.pool.begin().await?;
@@ -147,9 +224,9 @@
     let idx = self.index.read().await;
     Ok(
       idx
-        .search(index_name, query)
+        .search_hits(index_name, query)
        .into_iter()
-        .map(|(id, score)| NativeSearchHit { id, score })
+        .map(Into::into)
        .collect(),
     )
   }
@@ -165,7 +242,23 @@
       idx
         .get_matches(index_name, doc_id, query)
         .into_iter()
-        .map(|(start, end)| NativeMatch { start, end })
+        .map(Into::into)
+        .collect(),
+    )
+  }
+
+  pub async fn fts_get_matches_for_terms(
+    &self,
+    index_name: &str,
+    doc_id: &str,
+    terms: Vec<String>,
+  ) -> Result<Vec<NativeMatch>> {
+    let idx = self.index.read().await;
+    Ok(
+      idx
+        .get_matches_for_terms(index_name, doc_id, &terms)
+        .into_iter()
+        .map(Into::into)
         .collect(),
     )
   }
@@ -206,8 +299,8 @@ mod tests {
   use super::{super::error::Error, *};

-  const DEMO_BIN: &[u8] = include_bytes!("../../../../../common/native/fixtures/demo.ydoc");
-  const DEMO_JSON: &[u8] = include_bytes!("../../../../../common/native/fixtures/demo.ydoc.json");
+  const DEMO_BIN: &[u8] = include_bytes!("../../../../common/native/fixtures/demo.ydoc");
+  const DEMO_JSON: &[u8] = include_bytes!("../../../../common/native/fixtures/demo.ydoc.json");

   fn temp_workspace_dir() -> PathBuf {
     std::env::temp_dir().join(format!("affine-native-{}", Uuid::new_v4()))
   }
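Two details worth calling out in the indexer.rs hunks above: snapshot blobs are bincode-encoded through the serde bridge before zstd compression, and the compression level moves from 0 (zstd's "use the default level" sentinel, currently 3) to an explicit 4. A minimal round-trip sketch of that persistence format, using a hypothetical stand-in struct rather than the real SnapshotData (the bincode and zstd calls mirror the ones in the diff):

use serde::{Deserialize, Serialize};

// Stand-in for the indexer's SnapshotData; only the encode/decode shape matters here.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct Snapshot {
  docs: Vec<String>,
}

fn roundtrip(snapshot: &Snapshot) -> anyhow::Result<Snapshot> {
  let config = bincode::config::standard();
  // Serialize, then compress at level 4 as the patch now does.
  let blob = bincode::serde::encode_to_vec(snapshot, config)?;
  let compressed = zstd::stream::encode_all(std::io::Cursor::new(&blob), 4)?;
  // Decompress, then decode; the discarded `_` is the bytes-consumed count.
  let decompressed = zstd::stream::decode_all(std::io::Cursor::new(&compressed))?;
  let (decoded, _) = bincode::serde::decode_from_slice::<Snapshot, _>(&decompressed, config)?;
  Ok(decoded)
}

fn main() -> anyhow::Result<()> {
  let snap = Snapshot { docs: vec!["hello world".into()] };
  assert_eq!(roundtrip(&snap)?, snap);
  Ok(())
}

encode_to_vec and decode_from_slice come from bincode 2's serde module, which is why the nbstore Cargo.toml pins bincode 2.0.1 with the "serde" feature enabled.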
diff --git a/packages/frontend/native/nbstore/src/indexer/memory_indexer.rs b/packages/frontend/native/nbstore/src/indexer/memory_indexer.rs
deleted file mode 100644
index f4c927ce88..0000000000
--- a/packages/frontend/native/nbstore/src/indexer/memory_indexer.rs
+++ /dev/null
@@ -1,261 +0,0 @@
-use std::collections::{HashMap, HashSet};
-
-use super::{
-  tokenizer::tokenize,
-  types::{DocData, SnapshotData},
-};
-
-type DirtyDoc = (String, String, String, i64);
-type DeletedDoc = HashMap<String, HashSet<String>>;
-
-#[derive(Default, Debug)]
-pub struct InMemoryIndex {
-  pub docs: HashMap<String, HashMap<String, DocData>>,
-  pub inverted: HashMap<String, HashMap<String, HashMap<String, i64>>>,
-  pub total_lens: HashMap<String, i64>,
-  pub dirty: HashMap<String, HashSet<String>>,
-  pub deleted: HashMap<String, HashSet<String>>,
-}
-
-impl InMemoryIndex {
-  pub fn add_doc(&mut self, index_name: &str, doc_id: &str, text: &str, index: bool) {
-    let tokens = if index { tokenize(text) } else { vec![] };
-    // doc_len should be the number of tokens (including duplicates)
-    let doc_len = tokens.len() as i64;
-
-    let mut pos_map: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
-    for token in tokens {
-      pos_map
-        .entry(token.term)
-        .or_default()
-        .push((token.start as u32, token.end as u32));
-    }
-
-    if let Some(docs) = self.docs.get_mut(index_name) {
-      if let Some(old_data) = docs.remove(doc_id) {
-        *self.total_lens.entry(index_name.to_string()).or_default() -= old_data.doc_len;
-
-        if let Some(inverted) = self.inverted.get_mut(index_name) {
-          for (term, _) in old_data.term_pos {
-            if let Some(doc_map) = inverted.get_mut(&term) {
-              doc_map.remove(doc_id);
-              if doc_map.is_empty() {
-                inverted.remove(&term);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    let doc_data = DocData {
-      content: text.to_string(),
-      doc_len,
-      term_pos: pos_map.clone(),
-    };
-
-    self
-      .docs
-      .entry(index_name.to_string())
-      .or_default()
-      .insert(doc_id.to_string(), doc_data);
-    *self.total_lens.entry(index_name.to_string()).or_default() += doc_len;
-
-    let inverted = self.inverted.entry(index_name.to_string()).or_default();
-    for (term, positions) in pos_map {
-      inverted
-        .entry(term)
-        .or_default()
-        .insert(doc_id.to_string(), positions.len() as i64);
-    }
-
-    self
-      .dirty
-      .entry(index_name.to_string())
-      .or_default()
-      .insert(doc_id.to_string());
-    if let Some(deleted) = self.deleted.get_mut(index_name) {
-      deleted.remove(doc_id);
-    }
-  }
-
-  pub fn remove_doc(&mut self, index_name: &str, doc_id: &str) {
-    if let Some(docs) = self.docs.get_mut(index_name) {
-      if let Some(old_data) = docs.remove(doc_id) {
-        *self.total_lens.entry(index_name.to_string()).or_default() -= old_data.doc_len;
-
-        if let Some(inverted) = self.inverted.get_mut(index_name) {
-          for (term, _) in old_data.term_pos {
-            if let Some(doc_map) = inverted.get_mut(&term) {
-              doc_map.remove(doc_id);
-              if doc_map.is_empty() {
-                inverted.remove(&term);
-              }
-            }
-          }
-        }
-
-        self
-          .deleted
-          .entry(index_name.to_string())
-          .or_default()
-          .insert(doc_id.to_string());
-        if let Some(dirty) = self.dirty.get_mut(index_name) {
-          dirty.remove(doc_id);
-        }
-      }
-    }
-  }
-
-  pub fn get_doc(&self, index_name: &str, doc_id: &str) -> Option<String> {
-    self
-      .docs
-      .get(index_name)
-      .and_then(|docs| docs.get(doc_id))
-      .map(|d| d.content.clone())
-  }
-
-  pub fn search(&self, index_name: &str, query: &str) -> Vec<(String, f64)> {
-    if query == "*" || query.is_empty() {
-      if let Some(docs) = self.docs.get(index_name) {
-        return docs.keys().map(|k| (k.clone(), 1.0)).collect();
-      }
-      return vec![];
-    }
-
-    let query_terms = tokenize(query);
-    if query_terms.is_empty() {
-      return vec![];
-    }
-
-    let inverted = match self.inverted.get(index_name) {
-      Some(i) => i,
-      None => return vec![],
-    };
-
-    let mut candidates: Option<HashSet<String>> = None;
-
-    for token in &query_terms {
-      if let Some(doc_map) = inverted.get(&token.term) {
-        let docs: HashSet<String> = doc_map.keys().cloned().collect();
-        match candidates {
-          None => candidates = Some(docs),
-          Some(ref mut c) => {
-            c.retain(|id| docs.contains(id));
-          }
-        }
-        if candidates.as_ref().unwrap().is_empty() {
-          return vec![];
-        }
-      } else {
-        return vec![];
-      }
-    }
-
-    let candidates = candidates.unwrap_or_default();
-    if candidates.is_empty() {
-      return vec![];
-    }
-
-    let docs = self.docs.get(index_name).unwrap();
-    let total_len = *self.total_lens.get(index_name).unwrap_or(&0);
-    let n = docs.len() as f64;
-    let avgdl = if n > 0.0 { total_len as f64 / n } else { 0.0 };
-
-    let k1 = 1.2;
-    let b = 0.75;
-
-    let mut scores: Vec<(String, f64)> = Vec::with_capacity(candidates.len());
-
-    let mut idfs = HashMap::new();
-    for token in &query_terms {
-      let n_q = inverted.get(&token.term).map(|m| m.len()).unwrap_or(0) as f64;
-      let idf = ((n - n_q + 0.5) / (n_q + 0.5) + 1.0).ln();
-      idfs.insert(&token.term, idf);
-    }
-
-    for doc_id in candidates {
-      let doc_data = docs.get(&doc_id).unwrap();
-      let mut score = 0.0;
-
-      for token in &query_terms {
-        if let Some(positions) = doc_data.term_pos.get(&token.term) {
-          let freq = positions.len() as f64;
-          let idf = idfs.get(&token.term).unwrap();
-          let numerator = freq * (k1 + 1.0);
-          let denominator = freq + k1 * (1.0 - b + b * (doc_data.doc_len as f64 / avgdl));
-          score += idf * (numerator / denominator);
-        }
-      }
-      scores.push((doc_id, score));
-    }
-
-    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
-
-    scores
-  }
-
-  pub fn take_dirty_and_deleted(&mut self) -> (Vec<DirtyDoc>, DeletedDoc) {
-    let dirty = std::mem::take(&mut self.dirty);
-    let deleted = std::mem::take(&mut self.deleted);
-
-    let mut dirty_data = Vec::new();
-    for (index_name, doc_ids) in &dirty {
-      if let Some(docs) = self.docs.get(index_name) {
-        for doc_id in doc_ids {
-          if let Some(data) = docs.get(doc_id) {
-            dirty_data.push((
-              index_name.clone(),
-              doc_id.clone(),
-              data.content.clone(),
-              data.doc_len,
-            ));
-          }
-        }
-      }
-    }
-    (dirty_data, deleted)
-  }
-
-  pub fn get_matches(&self, index_name: &str, doc_id: &str, query: &str) -> Vec<(u32, u32)> {
-    let mut matches = Vec::new();
-    if let Some(docs) = self.docs.get(index_name) {
-      if let Some(doc_data) = docs.get(doc_id) {
-        let query_tokens = tokenize(query);
-        for token in query_tokens {
-          if let Some(positions) = doc_data.term_pos.get(&token.term) {
-            matches.extend(positions.iter().cloned());
-          }
-        }
-      }
-    }
-    matches.sort_by(|a, b| a.0.cmp(&b.0));
-    matches
-  }
-
-  pub fn load_snapshot(&mut self, index_name: &str, snapshot: SnapshotData) {
-    let docs = self.docs.entry(index_name.to_string()).or_default();
-    let inverted = self.inverted.entry(index_name.to_string()).or_default();
-    let total_len = self.total_lens.entry(index_name.to_string()).or_default();
-
-    for (doc_id, doc_data) in snapshot.docs {
-      *total_len += doc_data.doc_len;
-
-      for (term, positions) in &doc_data.term_pos {
-        inverted
-          .entry(term.clone())
-          .or_default()
-          .insert(doc_id.clone(), positions.len() as i64);
-      }
-
-      docs.insert(doc_id, doc_data);
-    }
-  }
-
-  pub fn get_snapshot_data(&self, index_name: &str) -> Option<SnapshotData> {
-    self
-      .docs
-      .get(index_name)
-      .map(|docs| SnapshotData { docs: docs.clone() })
-  }
-}
diff --git a/packages/frontend/native/nbstore/src/indexer/tokenizer.rs b/packages/frontend/native/nbstore/src/indexer/tokenizer.rs
deleted file mode 100644
index 5a4753ad1b..0000000000
--- a/packages/frontend/native/nbstore/src/indexer/tokenizer.rs
+++ /dev/null
@@ -1,85 +0,0 @@
-use jieba_rs::Jieba;
-use once_cell::sync::Lazy;
-use tiniestsegmenter::tokenize as ts_tokenize;
-
-static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Token {
-  pub term: String,
-  pub start: usize,
-  pub end: usize,
-}
-
-pub fn tokenize(text: &str) -> Vec<Token> {
-  let mut tokens = Vec::new();
-
-  // Use jieba for Chinese/English
-  // Jieba tokenize returns tokens with offsets
-  let jieba_tokens = JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, false);
-  for token in jieba_tokens {
-    if token.word.chars().any(|c| c.is_alphanumeric()) {
-      tokens.push(Token {
-        term: token.word.to_lowercase(),
-        start: token.start,
-        end: token.end,
-      });
-    }
-  }
-
-  // Use TinySegmenter for Japanese
-  // TinySegmenter does not provide offsets, so we have to find them manually
-  // This is a simplified approach and might not be perfect for repeated terms
-  let mut last_pos = 0;
-  for term in ts_tokenize(text) {
-    if term.chars().any(|c| c.is_alphanumeric()) {
-      if let Some(pos) = text[last_pos..].find(term) {
-        let start = last_pos + pos;
-        let end = start + term.len();
-        tokens.push(Token {
-          term: term.to_lowercase(),
-          start,
-          end,
-        });
-        last_pos = end;
-      }
-    }
-  }
-
-  // Manually handle Korean bigrams and unigrams
-  let chars: Vec<char> = text.chars().collect();
-  let mut byte_offset = 0;
-  for (i, &c) in chars.iter().enumerate() {
-    let char_len = c.len_utf8();
-    if is_hangul(c) {
-      tokens.push(Token {
-        term: c.to_string().to_lowercase(),
-        start: byte_offset,
-        end: byte_offset + char_len,
-      });
-      if i + 1 < chars.len() {
-        let next = chars[i + 1];
-        if is_hangul(next) {
-          let next_len = next.len_utf8();
-          tokens.push(Token {
-            term: format!("{}{}", c, next).to_lowercase(),
-            start: byte_offset,
-            end: byte_offset + char_len + next_len,
-          });
-        }
-      }
-    }
-    byte_offset += char_len;
-  }
-
-  tokens
-}
-
-fn is_hangul(c: char) -> bool {
-  // Hangul Syllables
-  ('\u{AC00}'..='\u{D7AF}').contains(&c)
-  // Hangul Jamo
-  || ('\u{1100}'..='\u{11FF}').contains(&c)
-  // Hangul Compatibility Jamo
-  || ('\u{3130}'..='\u{318F}').contains(&c)
-}
diff --git a/packages/frontend/native/nbstore/src/indexer/types.rs b/packages/frontend/native/nbstore/src/indexer/types.rs
deleted file mode 100644
index 9099b0625a..0000000000
--- a/packages/frontend/native/nbstore/src/indexer/types.rs
+++ /dev/null
@@ -1,79 +0,0 @@
-use std::collections::HashMap;
-
-use affine_common::doc_parser::{BlockInfo, CrawlResult};
-use napi_derive::napi;
-use serde::{Deserialize, Serialize};
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocData {
-  pub content: String,
-  pub doc_len: i64,
-  pub term_pos: HashMap<String, Vec<(u32, u32)>>,
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-pub struct SnapshotData {
-  pub docs: HashMap<String, DocData>,
-}
-
-#[napi(object)]
-#[derive(Debug, Serialize)]
-pub struct NativeBlockInfo {
-  pub block_id: String,
-  pub flavour: String,
-  pub content: Option<Vec<String>>,
-  pub blob: Option<Vec<String>>,
-  pub ref_doc_id: Option<Vec<String>>,
-  pub ref_info: Option<Vec<String>>,
-  pub parent_flavour: Option<String>,
-  pub parent_block_id: Option<String>,
-  pub additional: Option<String>,
-}
-
-#[napi(object)]
-#[derive(Debug, Serialize)]
-pub struct NativeCrawlResult {
-  pub blocks: Vec<NativeBlockInfo>,
-  pub title: String,
-  pub summary: String,
-}
-
-#[napi(object)]
-#[derive(Debug, Serialize)]
-pub struct NativeSearchHit {
-  pub id: String,
-  pub score: f64,
-}
-
-#[napi(object)]
-#[derive(Debug, Serialize)]
-pub struct NativeMatch {
-  pub start: u32,
-  pub end: u32,
-}
-
-impl From<BlockInfo> for NativeBlockInfo {
-  fn from(value: BlockInfo) -> Self {
-    Self {
-      block_id: value.block_id,
-      flavour: value.flavour,
-      content: value.content,
-      blob: value.blob,
-      ref_doc_id: value.ref_doc_id,
-      ref_info: value.ref_info,
-      parent_flavour: value.parent_flavour,
-      parent_block_id: value.parent_block_id,
-      additional: value.additional,
-    }
-  }
-}
-
-impl From<CrawlResult> for NativeCrawlResult {
-  fn from(value: CrawlResult) -> Self {
-    Self {
-      blocks: value.blocks.into_iter().map(Into::into).collect(),
-      title: value.title,
-      summary: value.summary,
-    }
-  }
-}
diff --git a/packages/frontend/native/nbstore/src/storage.rs b/packages/frontend/native/nbstore/src/storage.rs
index 664f3db3ac..82850199f8 100644
--- a/packages/frontend/native/nbstore/src/storage.rs
+++ b/packages/frontend/native/nbstore/src/storage.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;

 use affine_schema::get_migrator;
+use memory_indexer::InMemoryIndex;
 use sqlx::{
   migrate::MigrateDatabase,
   sqlite::{Sqlite, SqliteConnectOptions, SqlitePoolOptions},
@@ -8,7 +9,7 @@ use sqlx::{
 };
 use tokio::sync::RwLock;

-use super::{error::Result, indexer::InMemoryIndex};
+use super::error::Result;

 pub struct SqliteDocStorage {
   pub pool: Pool<Sqlite>,
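End to end, the flow this patch enables: search now returns hits whose `terms` list names the terms that actually matched, which matters because the fuzzy matching in `memory-indexer` 0.2 (note its pinyin and strsim dependencies in the lockfile) can match terms that are not literal substrings of the query. The new `fts_get_matches_for_terms` then turns those matched terms back into highlight ranges, instead of re-tokenizing the raw query as the old `get_matches` path did. A usage sketch of the crate as this patch consumes it; the `add_doc`, `search_hits`, and `get_matches_for_terms` signatures are inferred from the call sites above and from the deleted in-repo indexer, so treat them as assumptions rather than the crate's documented API:

use memory_indexer::InMemoryIndex;

fn demo() {
  // Assumed signature: (index_name, doc_id, text, build_index), matching the
  // deleted in-repo InMemoryIndex::add_doc.
  let mut index = InMemoryIndex::default();
  index.add_doc("docs", "doc-a", "AFFiNE 支持拼音模糊搜索", true);
  index.add_doc("docs", "doc-b", "fuzzy search tolerates small typos", true);

  // Step 1: rank. Each hit carries the matched terms, which is what the
  // widened NativeSearchHit { id, score, terms } forwards to JS.
  for hit in index.search_hits("docs", "pinyin mohu") {
    let terms: Vec<String> = hit.matched_terms.iter().map(|t| t.term.clone()).collect();
    // Step 2: highlight. Resolve ranges for the terms that matched rather
    // than re-tokenizing the query (the old get_matches path).
    let ranges = index.get_matches_for_terms("docs", &hit.doc_id, &terms);
    println!("{} (score {:.3}): {:?}", hit.doc_id, hit.score, ranges);
  }
}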