From d25b2163113da5acf59162b375a6eabf26573a86 Mon Sep 17 00:00:00 2001 From: darkskygit Date: Wed, 26 Feb 2025 08:05:20 +0000 Subject: [PATCH] feat(server): adapt doc loader for server native (#9942) --- Cargo.lock | 6 +- docs/developing-server.md | 4 ++ packages/backend/native/Cargo.toml | 4 +- packages/backend/native/index.d.ts | 2 + packages/backend/native/index.js | 1 + packages/backend/native/src/doc_loader.rs | 64 +++++++++++++++++++ packages/backend/native/src/lib.rs | 1 + packages/common/native/Cargo.toml | 2 +- .../common/native/src/doc_loader/document.rs | 37 ++++++----- .../src/doc_loader/{loader => }/error.rs | 29 +++++---- .../native/src/doc_loader/loader/docx.rs | 2 +- .../native/src/doc_loader/loader/html.rs | 2 +- .../native/src/doc_loader/loader/mod.rs | 9 +-- .../native/src/doc_loader/loader/pdf.rs | 5 +- .../src/doc_loader/loader/source/mod.rs | 2 +- .../native/src/doc_loader/loader/text.rs | 2 +- packages/common/native/src/doc_loader/mod.rs | 4 +- 17 files changed, 121 insertions(+), 55 deletions(-) create mode 100644 packages/backend/native/src/doc_loader.rs rename packages/common/native/src/doc_loader/{loader => }/error.rs (58%) diff --git a/Cargo.lock b/Cargo.lock index 759793e124..56bb45bcec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3110,7 +3110,6 @@ version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ - "indexmap", "itoa", "memchr", "ryu", @@ -3912,14 +3911,13 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.25.1" +version = "0.24.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a802c93485fb6781d27e27cb5927f6b00ff8d26b56c70af87267be7e99def97" +checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75" dependencies = [ "cc", "regex", "regex-syntax 0.8.5", - "serde_json", "streaming-iterator", "tree-sitter-language", ] diff --git a/docs/developing-server.md b/docs/developing-server.md index b0d9a111ba..99bd751ec3 100644 --- a/docs/developing-server.md +++ b/docs/developing-server.md @@ -22,6 +22,10 @@ cp ./.docker/dev/.env.example ./.docker/dev/.env docker compose -f ./.docker/dev/compose.yml up ``` +#### Notify + +> Starting from AFFiNE 0.20, compose.yml includes a breaking change: the default database image has switched from `postgres:16` to `pgvector/pgvector:pg16`. If you were previously using another major version of Postgres, please change the number after `pgvector/pgvector:pg` to the major version you are using. + ## Build native packages (you need to setup rust toolchain first) Server also requires native packages to be built, you can build them by running the following command: diff --git a/packages/backend/native/Cargo.toml b/packages/backend/native/Cargo.toml index f1c55f33ad..2f88c678bd 100644 --- a/packages/backend/native/Cargo.toml +++ b/packages/backend/native/Cargo.toml @@ -7,10 +7,10 @@ version = "1.0.0" crate-type = ["cdylib"] [dependencies] -affine_common = { workspace = true } +affine_common = { workspace = true, features = ["doc-loader"] } chrono = { workspace = true } file-format = { workspace = true } -napi = { workspace = true } +napi = { workspace = true, features = ["async"] } napi-derive = { workspace = true } rand = { workspace = true } sha3 = { workspace = true } diff --git a/packages/backend/native/index.d.ts b/packages/backend/native/index.d.ts index c5d2e4e1d0..6e4498a808 100644 --- a/packages/backend/native/index.d.ts +++ b/packages/backend/native/index.d.ts @@ -18,4 +18,6 @@ export declare function mergeUpdatesInApplyWay(updates: Array): Buffer export declare function mintChallengeResponse(resource: string, bits?: number | undefined | null): Promise +export declare function parseDoc(filePath: string, doc: Buffer): Promise<{ name: string, chunks: Array<{index: number, content: string}> }> + export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise diff --git a/packages/backend/native/index.js b/packages/backend/native/index.js index 5c4b150f0e..7a93344621 100644 --- a/packages/backend/native/index.js +++ b/packages/backend/native/index.js @@ -12,3 +12,4 @@ export const getMime = binding.getMime; export const Tokenizer = binding.Tokenizer; export const fromModelName = binding.fromModelName; export const htmlSanitize = binding.htmlSanitize; +export const parseDoc = binding.parseDoc; diff --git a/packages/backend/native/src/doc_loader.rs b/packages/backend/native/src/doc_loader.rs new file mode 100644 index 0000000000..f35113ec30 --- /dev/null +++ b/packages/backend/native/src/doc_loader.rs @@ -0,0 +1,64 @@ +use affine_common::doc_loader::Doc; +use napi::{ + anyhow::anyhow, + bindgen_prelude::{AsyncTask, Buffer}, + Env, JsObject, Result, Task, +}; + +pub struct Document { + inner: Doc, +} + +impl Document { + fn name(&self) -> String { + self.inner.name.clone() + } + + fn chunks(&self, env: Env) -> Result { + let mut array = env.create_array_with_length(self.inner.chunks.len())?; + for (i, chunk) in self.inner.chunks.iter().enumerate() { + let mut obj = env.create_object()?; + obj.set_named_property("index", i as i64)?; + obj.set_named_property("content", chunk.content.clone())?; + array.set_element(i as u32, obj)?; + } + Ok(array) + } + + fn resolve(self, env: Env) -> Result { + let mut obj = env.create_object()?; + obj.set_named_property("name", self.name())?; + obj.set_named_property("chunks", self.chunks(env)?)?; + Ok(obj) + } +} + +pub struct AsyncParseDocResponse { + file_path: String, + doc: Vec, +} + +#[napi] +impl Task for AsyncParseDocResponse { + type Output = Document; + type JsValue = JsObject; + + fn compute(&mut self) -> Result { + let doc = Doc::new(&self.file_path, &self.doc).map_err(|e| anyhow!(e))?; + Ok(Document { inner: doc }) + } + + fn resolve(&mut self, env: Env, doc: Document) -> Result { + doc.resolve(env) + } +} + +#[napi( + ts_return_type = "Promise<{ name: string, chunks: Array<{index: number, content: string}> }>" +)] +pub fn parse_doc(file_path: String, doc: Buffer) -> AsyncTask { + AsyncTask::new(AsyncParseDocResponse { + file_path, + doc: doc.to_vec(), + }) +} diff --git a/packages/backend/native/src/lib.rs b/packages/backend/native/src/lib.rs index 8e18135c50..0c8d9c550a 100644 --- a/packages/backend/native/src/lib.rs +++ b/packages/backend/native/src/lib.rs @@ -1,5 +1,6 @@ #![deny(clippy::all)] +pub mod doc_loader; pub mod file_type; pub mod hashcash; pub mod html_sanitize; diff --git a/packages/common/native/Cargo.toml b/packages/common/native/Cargo.toml index dd3992ca07..5e39314c4b 100644 --- a/packages/common/native/Cargo.toml +++ b/packages/common/native/Cargo.toml @@ -36,7 +36,7 @@ serde_json = { version = "1.0", optional = true } strum_macros = { version = "0.26.2", optional = true } text-splitter = { version = "0.22", features = ["markdown", "tiktoken-rs"], optional = true } thiserror = { version = "1", optional = true } -tree-sitter = { version = "0.25", optional = true } +tree-sitter = { version = "0.24", optional = true } tree-sitter-c = { version = "0.23", optional = true } tree-sitter-c-sharp = { version = "0.23", optional = true } tree-sitter-cpp = { version = "0.23", optional = true } diff --git a/packages/common/native/src/doc_loader/document.rs b/packages/common/native/src/doc_loader/document.rs index 881767d4b9..d6c3166c92 100644 --- a/packages/common/native/src/doc_loader/document.rs +++ b/packages/common/native/src/doc_loader/document.rs @@ -30,11 +30,11 @@ pub struct Doc { } impl Doc { - pub fn new(file_path: &str, doc: &[u8]) -> Option { + pub fn new(file_path: &str, doc: &[u8]) -> LoaderResult { Self::with_options(file_path, doc, DocOptions::default()) } - pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> Option { + pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> LoaderResult { if let Some(kind) = infer::get(&doc[..4096.min(doc.len())]).or(infer::get_from_path(file_path).ok().flatten()) { @@ -58,25 +58,25 @@ impl Doc { "md" => { let loader = TextLoader::new(string); let splitter = MarkdownSplitter::default(); - return Self::from_loader(file_path, loader, splitter).ok(); + return Self::from_loader(file_path, loader, splitter); } "rs" | "c" | "cpp" | "h" | "hpp" | "js" | "ts" | "tsx" | "go" | "py" => { let name = path.full_str().to_string(); let loader = SourceCodeLoader::from_string(string).with_parser_option(LanguageParserOptions { - language: get_language_by_filename(&name).ok()?, + language: get_language_by_filename(&name)?, parser_threshold: options.code_threshold, }); let splitter = TokenSplitter::default(); - return Self::from_loader(file_path, loader, splitter).ok(); + return Self::from_loader(file_path, loader, splitter); } _ => {} } let loader = TextLoader::new(string); let splitter = TokenSplitter::default(); - return Self::from_loader(file_path, loader, splitter).ok(); + return Self::from_loader(file_path, loader, splitter); } - None + Err(LoaderError::Other("Failed to infer document type".into())) } fn from_loader( @@ -107,27 +107,26 @@ impl Doc { ) } - fn load_docx(file_path: &str, doc: &[u8]) -> Option { - let loader = DocxLoader::new(Cursor::new(doc))?; + fn load_docx(file_path: &str, doc: &[u8]) -> LoaderResult { + let loader = DocxLoader::new(Cursor::new(doc)) + .ok_or(LoaderError::Other("Failed to parse docx document".into()))?; let splitter = TokenSplitter::default(); - Self::from_loader(file_path, loader, splitter).ok() + Self::from_loader(file_path, loader, splitter) } - fn load_html(file_path: &str, doc: &[u8]) -> Option { + fn load_html(file_path: &str, doc: &[u8]) -> LoaderResult { let loader = HtmlLoader::from_string( - String::from_utf8(doc.to_vec()).ok()?, - Url::parse(file_path) - .or(Url::parse("https://example.com/")) - .ok()?, + String::from_utf8(doc.to_vec())?, + Url::parse(file_path).or(Url::parse("https://example.com/"))?, ); let splitter = TokenSplitter::default(); - Self::from_loader(file_path, loader, splitter).ok() + Self::from_loader(file_path, loader, splitter) } - fn load_pdf(file_path: &str, doc: &[u8]) -> Option { - let loader = PdfExtractLoader::new(Cursor::new(doc)).ok()?; + fn load_pdf(file_path: &str, doc: &[u8]) -> LoaderResult { + let loader = PdfExtractLoader::new(Cursor::new(doc))?; let splitter = TokenSplitter::default(); - Self::from_loader(file_path, loader, splitter).ok() + Self::from_loader(file_path, loader, splitter) } } diff --git a/packages/common/native/src/doc_loader/loader/error.rs b/packages/common/native/src/doc_loader/error.rs similarity index 58% rename from packages/common/native/src/doc_loader/loader/error.rs rename to packages/common/native/src/doc_loader/error.rs index 94ded5ece9..c3e76e8db3 100644 --- a/packages/common/native/src/doc_loader/loader/error.rs +++ b/packages/common/native/src/doc_loader/error.rs @@ -10,33 +10,34 @@ use super::*; #[derive(Error, Debug)] pub enum LoaderError { #[error("{0}")] - TextSplitterError(#[from] TextSplitterError), + TextSplitter(#[from] TextSplitterError), #[error(transparent)] - IOError(#[from] io::Error), + IO(#[from] io::Error), #[error(transparent)] - Utf8Error(#[from] Utf8Error), + Utf8(#[from] Utf8Error), #[error(transparent)] - FromUtf8Error(#[from] FromUtf8Error), - - #[cfg(feature = "pdf-extract")] - #[error(transparent)] - PdfExtractError(#[from] pdf_extract::Error), - - #[cfg(feature = "pdf-extract")] - #[error(transparent)] - PdfExtractOutputError(#[from] pdf_extract::OutputError), + FromUtf8(#[from] FromUtf8Error), #[error(transparent)] - ReadabilityError(#[from] readability::error::Error), + PdfExtract(#[from] pdf_extract::Error), + + #[error(transparent)] + PdfExtractOutput(#[from] pdf_extract::OutputError), + + #[error(transparent)] + Readability(#[from] readability::error::Error), + + #[error(transparent)] + UrlParse(#[from] url::ParseError), #[error("Unsupported source language")] UnsupportedLanguage, #[error("Error: {0}")] - OtherError(String), + Other(String), } pub type LoaderResult = Result; diff --git a/packages/common/native/src/doc_loader/loader/docx.rs b/packages/common/native/src/doc_loader/loader/docx.rs index 1b989ff227..521a4b373c 100644 --- a/packages/common/native/src/doc_loader/loader/docx.rs +++ b/packages/common/native/src/doc_loader/loader/docx.rs @@ -24,7 +24,7 @@ impl DocxLoader { } impl Loader for DocxLoader { - fn load(self) -> Result, LoaderError> { + fn load(self) -> LoaderResult> { let doc = self.extract_text_to_doc(); Ok(vec![doc]) } diff --git a/packages/common/native/src/doc_loader/loader/html.rs b/packages/common/native/src/doc_loader/loader/html.rs index 347e8a9308..a9c15a9e84 100644 --- a/packages/common/native/src/doc_loader/loader/html.rs +++ b/packages/common/native/src/doc_loader/loader/html.rs @@ -27,7 +27,7 @@ impl HtmlLoader { } impl Loader for HtmlLoader { - fn load(mut self) -> Result, LoaderError> { + fn load(mut self) -> LoaderResult> { let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?; let doc = Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata( diff --git a/packages/common/native/src/doc_loader/loader/mod.rs b/packages/common/native/src/doc_loader/loader/mod.rs index 2b26d2ddf4..8a36c23312 100644 --- a/packages/common/native/src/doc_loader/loader/mod.rs +++ b/packages/common/native/src/doc_loader/loader/mod.rs @@ -1,5 +1,4 @@ mod docx; -mod error; mod html; mod pdf; mod source; @@ -11,11 +10,8 @@ use super::*; // modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders pub trait Loader: Send + Sync { - fn load(self) -> Result, LoaderError>; - fn load_and_split( - self, - splitter: TS, - ) -> Result, LoaderError> + fn load(self) -> LoaderResult>; + fn load_and_split(self, splitter: TS) -> LoaderResult> where Self: Sized, { @@ -25,7 +21,6 @@ pub trait Loader: Send + Sync { } pub use docx::DocxLoader; -pub use error::{LoaderError, LoaderResult}; pub use html::HtmlLoader; pub use pdf::PdfExtractLoader; pub use source::{get_language_by_filename, LanguageParserOptions, SourceCodeLoader}; diff --git a/packages/common/native/src/doc_loader/loader/pdf.rs b/packages/common/native/src/doc_loader/loader/pdf.rs index 83a240c469..358b0c02de 100644 --- a/packages/common/native/src/doc_loader/loader/pdf.rs +++ b/packages/common/native/src/doc_loader/loader/pdf.rs @@ -12,8 +12,7 @@ pub struct PdfExtractLoader { impl PdfExtractLoader { pub fn new(reader: R) -> Result { - let document = pdf_extract::Document::load_from(reader) - .map_err(|e| LoaderError::OtherError(e.to_string()))?; + let document = pdf_extract::Document::load_from(reader)?; Ok(Self { document }) } } @@ -38,7 +37,7 @@ impl PdfExtractLoader { } impl Loader for PdfExtractLoader { - fn load(self) -> Result, LoaderError> { + fn load(self) -> LoaderResult> { let doc = self.extract_text_to_doc()?; Ok(vec![doc]) } diff --git a/packages/common/native/src/doc_loader/loader/source/mod.rs b/packages/common/native/src/doc_loader/loader/source/mod.rs index 4d93c75f96..2c549b1fcc 100644 --- a/packages/common/native/src/doc_loader/loader/source/mod.rs +++ b/packages/common/native/src/doc_loader/loader/source/mod.rs @@ -30,7 +30,7 @@ impl SourceCodeLoader { } impl Loader for SourceCodeLoader { - fn load(self) -> Result, LoaderError> { + fn load(self) -> LoaderResult> { let options = self.parser_option.clone(); let docs = LanguageParser::from_language(options.language) diff --git a/packages/common/native/src/doc_loader/loader/text.rs b/packages/common/native/src/doc_loader/loader/text.rs index 0cb42e205d..62fdfde89b 100644 --- a/packages/common/native/src/doc_loader/loader/text.rs +++ b/packages/common/native/src/doc_loader/loader/text.rs @@ -17,7 +17,7 @@ impl TextLoader { } impl Loader for TextLoader { - fn load(self) -> Result, LoaderError> { + fn load(self) -> LoaderResult> { let doc = Document::new(self.content); Ok(vec![doc]) } diff --git a/packages/common/native/src/doc_loader/mod.rs b/packages/common/native/src/doc_loader/mod.rs index 625a898e09..c164cb8ce8 100644 --- a/packages/common/native/src/doc_loader/mod.rs +++ b/packages/common/native/src/doc_loader/mod.rs @@ -1,11 +1,13 @@ mod document; +mod error; mod loader; mod splitter; mod types; pub use document::{Chunk, Doc}; +pub use error::{LoaderError, LoaderResult}; use loader::{ - get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader, LoaderError, + get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader, PdfExtractLoader, SourceCodeLoader, TextLoader, Url, }; use splitter::{MarkdownSplitter, TextSplitter, TextSplitterError, TokenSplitter};