feat(server): adapt doc loader for server native (#9942)

This commit is contained in:
darkskygit
2025-02-26 08:05:20 +00:00
parent e1fd8f5d80
commit d25b216311
17 changed files with 121 additions and 55 deletions

6
Cargo.lock generated
View File

@@ -3110,7 +3110,6 @@ version = "1.0.138"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949"
dependencies = [ dependencies = [
"indexmap",
"itoa", "itoa",
"memchr", "memchr",
"ryu", "ryu",
@@ -3912,14 +3911,13 @@ dependencies = [
[[package]] [[package]]
name = "tree-sitter" name = "tree-sitter"
version = "0.25.1" version = "0.24.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a802c93485fb6781d27e27cb5927f6b00ff8d26b56c70af87267be7e99def97" checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75"
dependencies = [ dependencies = [
"cc", "cc",
"regex", "regex",
"regex-syntax 0.8.5", "regex-syntax 0.8.5",
"serde_json",
"streaming-iterator", "streaming-iterator",
"tree-sitter-language", "tree-sitter-language",
] ]

View File

@@ -22,6 +22,10 @@ cp ./.docker/dev/.env.example ./.docker/dev/.env
docker compose -f ./.docker/dev/compose.yml up docker compose -f ./.docker/dev/compose.yml up
``` ```
#### Notice
> Starting from AFFiNE 0.20, compose.yml includes a breaking change: the default database image has switched from `postgres:16` to `pgvector/pgvector:pg16`. If you were previously using another major version of Postgres, please change the number after `pgvector/pgvector:pg` to the major version you are using.
## Build native packages (you need to setup rust toolchain first) ## Build native packages (you need to setup rust toolchain first)
Server also requires native packages to be built, you can build them by running the following command: Server also requires native packages to be built, you can build them by running the following command:

View File

@@ -7,10 +7,10 @@ version = "1.0.0"
crate-type = ["cdylib"] crate-type = ["cdylib"]
[dependencies] [dependencies]
affine_common = { workspace = true } affine_common = { workspace = true, features = ["doc-loader"] }
chrono = { workspace = true } chrono = { workspace = true }
file-format = { workspace = true } file-format = { workspace = true }
napi = { workspace = true } napi = { workspace = true, features = ["async"] }
napi-derive = { workspace = true } napi-derive = { workspace = true }
rand = { workspace = true } rand = { workspace = true }
sha3 = { workspace = true } sha3 = { workspace = true }

View File

@@ -18,4 +18,6 @@ export declare function mergeUpdatesInApplyWay(updates: Array<Buffer>): Buffer
export declare function mintChallengeResponse(resource: string, bits?: number | undefined | null): Promise<string> export declare function mintChallengeResponse(resource: string, bits?: number | undefined | null): Promise<string>
export declare function parseDoc(filePath: string, doc: Buffer): Promise<{ name: string, chunks: Array<{index: number, content: string}> }>
export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise<boolean> export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise<boolean>

View File

@@ -12,3 +12,4 @@ export const getMime = binding.getMime;
export const Tokenizer = binding.Tokenizer; export const Tokenizer = binding.Tokenizer;
export const fromModelName = binding.fromModelName; export const fromModelName = binding.fromModelName;
export const htmlSanitize = binding.htmlSanitize; export const htmlSanitize = binding.htmlSanitize;
export const parseDoc = binding.parseDoc;

View File

@@ -0,0 +1,64 @@
use affine_common::doc_loader::Doc;
use napi::{
anyhow::anyhow,
bindgen_prelude::{AsyncTask, Buffer},
Env, JsObject, Result, Task,
};
/// Result of parsing a document: wraps the loader's `Doc` and knows how to
/// convert itself into the JS object shape declared in the TS bindings
/// (`{ name: string, chunks: Array<{index, content}> }`).
pub struct Document {
  inner: Doc,
}

impl Document {
  /// Owned copy of the name produced by the document loader.
  fn name(&self) -> String {
    self.inner.name.clone()
  }

  /// Converts the parsed chunks into a JS array of `{ index, content }` objects.
  fn chunks(&self, env: Env) -> Result<JsObject> {
    let mut array = env.create_array_with_length(self.inner.chunks.len())?;
    for (i, chunk) in self.inner.chunks.iter().enumerate() {
      let mut obj = env.create_object()?;
      // NOTE(review): `index` here is the enumeration position, not a field
      // read from the chunk itself — confirm the chunk order is the intended
      // index semantics for consumers.
      obj.set_named_property("index", i as i64)?;
      obj.set_named_property("content", chunk.content.clone())?;
      array.set_element(i as u32, obj)?;
    }
    Ok(array)
  }

  /// Builds the final `{ name, chunks }` JS object handed back to callers.
  /// Consumes `self`; called from the napi task's resolve step.
  fn resolve(self, env: Env) -> Result<JsObject> {
    let mut obj = env.create_object()?;
    obj.set_named_property("name", self.name())?;
    obj.set_named_property("chunks", self.chunks(env)?)?;
    Ok(obj)
  }
}
/// napi async task that parses a document off the JS main thread.
pub struct AsyncParseDocResponse {
  // File path; the loader also uses it to infer the document type.
  file_path: String,
  // Raw document bytes copied out of the JS Buffer.
  doc: Vec<u8>,
}

#[napi]
impl Task for AsyncParseDocResponse {
  type Output = Document;
  type JsValue = JsObject;

  /// Runs on the worker thread: parse the bytes into a `Doc`.
  /// Loader errors are surfaced to JS as rejected promises via `anyhow!`.
  fn compute(&mut self) -> Result<Self::Output> {
    let doc = Doc::new(&self.file_path, &self.doc).map_err(|e| anyhow!(e))?;
    Ok(Document { inner: doc })
  }

  /// Runs back on the JS thread: convert the parsed doc into a JS object.
  fn resolve(&mut self, env: Env, doc: Document) -> Result<Self::JsValue> {
    doc.resolve(env)
  }
}
/// Parses a document buffer into named text chunks.
///
/// Exposed to JS as `parseDoc(filePath, doc)`; returns a promise resolving to
/// `{ name, chunks: [{ index, content }] }`. The heavy parsing work is
/// scheduled as an `AsyncTask` so it does not block the JS event loop.
#[napi(
  ts_return_type = "Promise<{ name: string, chunks: Array<{index: number, content: string}> }>"
)]
pub fn parse_doc(file_path: String, doc: Buffer) -> AsyncTask<AsyncParseDocResponse> {
  AsyncTask::new(AsyncParseDocResponse {
    file_path,
    // Copy the Buffer's bytes so the task owns its data across threads.
    doc: doc.to_vec(),
  })
}

View File

@@ -1,5 +1,6 @@
#![deny(clippy::all)] #![deny(clippy::all)]
pub mod doc_loader;
pub mod file_type; pub mod file_type;
pub mod hashcash; pub mod hashcash;
pub mod html_sanitize; pub mod html_sanitize;

View File

@@ -36,7 +36,7 @@ serde_json = { version = "1.0", optional = true }
strum_macros = { version = "0.26.2", optional = true } strum_macros = { version = "0.26.2", optional = true }
text-splitter = { version = "0.22", features = ["markdown", "tiktoken-rs"], optional = true } text-splitter = { version = "0.22", features = ["markdown", "tiktoken-rs"], optional = true }
thiserror = { version = "1", optional = true } thiserror = { version = "1", optional = true }
tree-sitter = { version = "0.25", optional = true } tree-sitter = { version = "0.24", optional = true }
tree-sitter-c = { version = "0.23", optional = true } tree-sitter-c = { version = "0.23", optional = true }
tree-sitter-c-sharp = { version = "0.23", optional = true } tree-sitter-c-sharp = { version = "0.23", optional = true }
tree-sitter-cpp = { version = "0.23", optional = true } tree-sitter-cpp = { version = "0.23", optional = true }

View File

@@ -30,11 +30,11 @@ pub struct Doc {
} }
impl Doc { impl Doc {
pub fn new(file_path: &str, doc: &[u8]) -> Option<Self> { pub fn new(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
Self::with_options(file_path, doc, DocOptions::default()) Self::with_options(file_path, doc, DocOptions::default())
} }
pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> Option<Self> { pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> LoaderResult<Self> {
if let Some(kind) = if let Some(kind) =
infer::get(&doc[..4096.min(doc.len())]).or(infer::get_from_path(file_path).ok().flatten()) infer::get(&doc[..4096.min(doc.len())]).or(infer::get_from_path(file_path).ok().flatten())
{ {
@@ -58,25 +58,25 @@ impl Doc {
"md" => { "md" => {
let loader = TextLoader::new(string); let loader = TextLoader::new(string);
let splitter = MarkdownSplitter::default(); let splitter = MarkdownSplitter::default();
return Self::from_loader(file_path, loader, splitter).ok(); return Self::from_loader(file_path, loader, splitter);
} }
"rs" | "c" | "cpp" | "h" | "hpp" | "js" | "ts" | "tsx" | "go" | "py" => { "rs" | "c" | "cpp" | "h" | "hpp" | "js" | "ts" | "tsx" | "go" | "py" => {
let name = path.full_str().to_string(); let name = path.full_str().to_string();
let loader = let loader =
SourceCodeLoader::from_string(string).with_parser_option(LanguageParserOptions { SourceCodeLoader::from_string(string).with_parser_option(LanguageParserOptions {
language: get_language_by_filename(&name).ok()?, language: get_language_by_filename(&name)?,
parser_threshold: options.code_threshold, parser_threshold: options.code_threshold,
}); });
let splitter = TokenSplitter::default(); let splitter = TokenSplitter::default();
return Self::from_loader(file_path, loader, splitter).ok(); return Self::from_loader(file_path, loader, splitter);
} }
_ => {} _ => {}
} }
let loader = TextLoader::new(string); let loader = TextLoader::new(string);
let splitter = TokenSplitter::default(); let splitter = TokenSplitter::default();
return Self::from_loader(file_path, loader, splitter).ok(); return Self::from_loader(file_path, loader, splitter);
} }
None Err(LoaderError::Other("Failed to infer document type".into()))
} }
fn from_loader( fn from_loader(
@@ -107,27 +107,26 @@ impl Doc {
) )
} }
fn load_docx(file_path: &str, doc: &[u8]) -> Option<Self> { fn load_docx(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
let loader = DocxLoader::new(Cursor::new(doc))?; let loader = DocxLoader::new(Cursor::new(doc))
.ok_or(LoaderError::Other("Failed to parse docx document".into()))?;
let splitter = TokenSplitter::default(); let splitter = TokenSplitter::default();
Self::from_loader(file_path, loader, splitter).ok() Self::from_loader(file_path, loader, splitter)
} }
fn load_html(file_path: &str, doc: &[u8]) -> Option<Self> { fn load_html(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
let loader = HtmlLoader::from_string( let loader = HtmlLoader::from_string(
String::from_utf8(doc.to_vec()).ok()?, String::from_utf8(doc.to_vec())?,
Url::parse(file_path) Url::parse(file_path).or(Url::parse("https://example.com/"))?,
.or(Url::parse("https://example.com/"))
.ok()?,
); );
let splitter = TokenSplitter::default(); let splitter = TokenSplitter::default();
Self::from_loader(file_path, loader, splitter).ok() Self::from_loader(file_path, loader, splitter)
} }
fn load_pdf(file_path: &str, doc: &[u8]) -> Option<Self> { fn load_pdf(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
let loader = PdfExtractLoader::new(Cursor::new(doc)).ok()?; let loader = PdfExtractLoader::new(Cursor::new(doc))?;
let splitter = TokenSplitter::default(); let splitter = TokenSplitter::default();
Self::from_loader(file_path, loader, splitter).ok() Self::from_loader(file_path, loader, splitter)
} }
} }

View File

@@ -10,33 +10,34 @@ use super::*;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum LoaderError { pub enum LoaderError {
#[error("{0}")] #[error("{0}")]
TextSplitterError(#[from] TextSplitterError), TextSplitter(#[from] TextSplitterError),
#[error(transparent)] #[error(transparent)]
IOError(#[from] io::Error), IO(#[from] io::Error),
#[error(transparent)] #[error(transparent)]
Utf8Error(#[from] Utf8Error), Utf8(#[from] Utf8Error),
#[error(transparent)] #[error(transparent)]
FromUtf8Error(#[from] FromUtf8Error), FromUtf8(#[from] FromUtf8Error),
#[cfg(feature = "pdf-extract")]
#[error(transparent)]
PdfExtractError(#[from] pdf_extract::Error),
#[cfg(feature = "pdf-extract")]
#[error(transparent)]
PdfExtractOutputError(#[from] pdf_extract::OutputError),
#[error(transparent)] #[error(transparent)]
ReadabilityError(#[from] readability::error::Error), PdfExtract(#[from] pdf_extract::Error),
#[error(transparent)]
PdfExtractOutput(#[from] pdf_extract::OutputError),
#[error(transparent)]
Readability(#[from] readability::error::Error),
#[error(transparent)]
UrlParse(#[from] url::ParseError),
#[error("Unsupported source language")] #[error("Unsupported source language")]
UnsupportedLanguage, UnsupportedLanguage,
#[error("Error: {0}")] #[error("Error: {0}")]
OtherError(String), Other(String),
} }
pub type LoaderResult<T> = Result<T, LoaderError>; pub type LoaderResult<T> = Result<T, LoaderError>;

View File

@@ -24,7 +24,7 @@ impl DocxLoader {
} }
impl Loader for DocxLoader { impl Loader for DocxLoader {
fn load(self) -> Result<Vec<Document>, LoaderError> { fn load(self) -> LoaderResult<Vec<Document>> {
let doc = self.extract_text_to_doc(); let doc = self.extract_text_to_doc();
Ok(vec![doc]) Ok(vec![doc])
} }

View File

@@ -27,7 +27,7 @@ impl<R: Read> HtmlLoader<R> {
} }
impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> { impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> {
fn load(mut self) -> Result<Vec<Document>, LoaderError> { fn load(mut self) -> LoaderResult<Vec<Document>> {
let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?; let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?;
let doc = let doc =
Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata( Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata(

View File

@@ -1,5 +1,4 @@
mod docx; mod docx;
mod error;
mod html; mod html;
mod pdf; mod pdf;
mod source; mod source;
@@ -11,11 +10,8 @@ use super::*;
// modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders // modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders
pub trait Loader: Send + Sync { pub trait Loader: Send + Sync {
fn load(self) -> Result<Vec<Document>, LoaderError>; fn load(self) -> LoaderResult<Vec<Document>>;
fn load_and_split<TS: TextSplitter + 'static>( fn load_and_split<TS: TextSplitter + 'static>(self, splitter: TS) -> LoaderResult<Vec<Document>>
self,
splitter: TS,
) -> Result<Vec<Document>, LoaderError>
where where
Self: Sized, Self: Sized,
{ {
@@ -25,7 +21,6 @@ pub trait Loader: Send + Sync {
} }
pub use docx::DocxLoader; pub use docx::DocxLoader;
pub use error::{LoaderError, LoaderResult};
pub use html::HtmlLoader; pub use html::HtmlLoader;
pub use pdf::PdfExtractLoader; pub use pdf::PdfExtractLoader;
pub use source::{get_language_by_filename, LanguageParserOptions, SourceCodeLoader}; pub use source::{get_language_by_filename, LanguageParserOptions, SourceCodeLoader};

View File

@@ -12,8 +12,7 @@ pub struct PdfExtractLoader {
impl PdfExtractLoader { impl PdfExtractLoader {
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> { pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
let document = pdf_extract::Document::load_from(reader) let document = pdf_extract::Document::load_from(reader)?;
.map_err(|e| LoaderError::OtherError(e.to_string()))?;
Ok(Self { document }) Ok(Self { document })
} }
} }
@@ -38,7 +37,7 @@ impl PdfExtractLoader {
} }
impl Loader for PdfExtractLoader { impl Loader for PdfExtractLoader {
fn load(self) -> Result<Vec<Document>, LoaderError> { fn load(self) -> LoaderResult<Vec<Document>> {
let doc = self.extract_text_to_doc()?; let doc = self.extract_text_to_doc()?;
Ok(vec![doc]) Ok(vec![doc])
} }

View File

@@ -30,7 +30,7 @@ impl SourceCodeLoader {
} }
impl Loader for SourceCodeLoader { impl Loader for SourceCodeLoader {
fn load(self) -> Result<Vec<Document>, LoaderError> { fn load(self) -> LoaderResult<Vec<Document>> {
let options = self.parser_option.clone(); let options = self.parser_option.clone();
let docs = LanguageParser::from_language(options.language) let docs = LanguageParser::from_language(options.language)

View File

@@ -17,7 +17,7 @@ impl TextLoader {
} }
impl Loader for TextLoader { impl Loader for TextLoader {
fn load(self) -> Result<Vec<Document>, LoaderError> { fn load(self) -> LoaderResult<Vec<Document>> {
let doc = Document::new(self.content); let doc = Document::new(self.content);
Ok(vec![doc]) Ok(vec![doc])
} }

View File

@@ -1,11 +1,13 @@
mod document; mod document;
mod error;
mod loader; mod loader;
mod splitter; mod splitter;
mod types; mod types;
pub use document::{Chunk, Doc}; pub use document::{Chunk, Doc};
pub use error::{LoaderError, LoaderResult};
use loader::{ use loader::{
get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader, LoaderError, get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader,
PdfExtractLoader, SourceCodeLoader, TextLoader, Url, PdfExtractLoader, SourceCodeLoader, TextLoader, Url,
}; };
use splitter::{MarkdownSplitter, TextSplitter, TextSplitterError, TokenSplitter}; use splitter::{MarkdownSplitter, TextSplitter, TextSplitterError, TokenSplitter};