mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 21:05:19 +00:00
feat(server): adapt doc loader for server native (#9942)
This commit is contained in:
@@ -30,11 +30,11 @@ pub struct Doc {
|
||||
}
|
||||
|
||||
impl Doc {
|
||||
pub fn new(file_path: &str, doc: &[u8]) -> Option<Self> {
|
||||
pub fn new(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
|
||||
Self::with_options(file_path, doc, DocOptions::default())
|
||||
}
|
||||
|
||||
pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> Option<Self> {
|
||||
pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> LoaderResult<Self> {
|
||||
if let Some(kind) =
|
||||
infer::get(&doc[..4096.min(doc.len())]).or(infer::get_from_path(file_path).ok().flatten())
|
||||
{
|
||||
@@ -58,25 +58,25 @@ impl Doc {
|
||||
"md" => {
|
||||
let loader = TextLoader::new(string);
|
||||
let splitter = MarkdownSplitter::default();
|
||||
return Self::from_loader(file_path, loader, splitter).ok();
|
||||
return Self::from_loader(file_path, loader, splitter);
|
||||
}
|
||||
"rs" | "c" | "cpp" | "h" | "hpp" | "js" | "ts" | "tsx" | "go" | "py" => {
|
||||
let name = path.full_str().to_string();
|
||||
let loader =
|
||||
SourceCodeLoader::from_string(string).with_parser_option(LanguageParserOptions {
|
||||
language: get_language_by_filename(&name).ok()?,
|
||||
language: get_language_by_filename(&name)?,
|
||||
parser_threshold: options.code_threshold,
|
||||
});
|
||||
let splitter = TokenSplitter::default();
|
||||
return Self::from_loader(file_path, loader, splitter).ok();
|
||||
return Self::from_loader(file_path, loader, splitter);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
let loader = TextLoader::new(string);
|
||||
let splitter = TokenSplitter::default();
|
||||
return Self::from_loader(file_path, loader, splitter).ok();
|
||||
return Self::from_loader(file_path, loader, splitter);
|
||||
}
|
||||
None
|
||||
Err(LoaderError::Other("Failed to infer document type".into()))
|
||||
}
|
||||
|
||||
fn from_loader(
|
||||
@@ -107,27 +107,26 @@ impl Doc {
|
||||
)
|
||||
}
|
||||
|
||||
fn load_docx(file_path: &str, doc: &[u8]) -> Option<Self> {
|
||||
let loader = DocxLoader::new(Cursor::new(doc))?;
|
||||
fn load_docx(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
|
||||
let loader = DocxLoader::new(Cursor::new(doc))
|
||||
.ok_or(LoaderError::Other("Failed to parse docx document".into()))?;
|
||||
let splitter = TokenSplitter::default();
|
||||
Self::from_loader(file_path, loader, splitter).ok()
|
||||
Self::from_loader(file_path, loader, splitter)
|
||||
}
|
||||
|
||||
fn load_html(file_path: &str, doc: &[u8]) -> Option<Self> {
|
||||
fn load_html(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
|
||||
let loader = HtmlLoader::from_string(
|
||||
String::from_utf8(doc.to_vec()).ok()?,
|
||||
Url::parse(file_path)
|
||||
.or(Url::parse("https://example.com/"))
|
||||
.ok()?,
|
||||
String::from_utf8(doc.to_vec())?,
|
||||
Url::parse(file_path).or(Url::parse("https://example.com/"))?,
|
||||
);
|
||||
let splitter = TokenSplitter::default();
|
||||
Self::from_loader(file_path, loader, splitter).ok()
|
||||
Self::from_loader(file_path, loader, splitter)
|
||||
}
|
||||
|
||||
fn load_pdf(file_path: &str, doc: &[u8]) -> Option<Self> {
|
||||
let loader = PdfExtractLoader::new(Cursor::new(doc)).ok()?;
|
||||
fn load_pdf(file_path: &str, doc: &[u8]) -> LoaderResult<Self> {
|
||||
let loader = PdfExtractLoader::new(Cursor::new(doc))?;
|
||||
let splitter = TokenSplitter::default();
|
||||
Self::from_loader(file_path, loader, splitter).ok()
|
||||
Self::from_loader(file_path, loader, splitter)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,33 +10,34 @@ use super::*;
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoaderError {
|
||||
#[error("{0}")]
|
||||
TextSplitterError(#[from] TextSplitterError),
|
||||
TextSplitter(#[from] TextSplitterError),
|
||||
|
||||
#[error(transparent)]
|
||||
IOError(#[from] io::Error),
|
||||
IO(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Utf8Error(#[from] Utf8Error),
|
||||
Utf8(#[from] Utf8Error),
|
||||
|
||||
#[error(transparent)]
|
||||
FromUtf8Error(#[from] FromUtf8Error),
|
||||
|
||||
#[cfg(feature = "pdf-extract")]
|
||||
#[error(transparent)]
|
||||
PdfExtractError(#[from] pdf_extract::Error),
|
||||
|
||||
#[cfg(feature = "pdf-extract")]
|
||||
#[error(transparent)]
|
||||
PdfExtractOutputError(#[from] pdf_extract::OutputError),
|
||||
FromUtf8(#[from] FromUtf8Error),
|
||||
|
||||
#[error(transparent)]
|
||||
ReadabilityError(#[from] readability::error::Error),
|
||||
PdfExtract(#[from] pdf_extract::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
PdfExtractOutput(#[from] pdf_extract::OutputError),
|
||||
|
||||
#[error(transparent)]
|
||||
Readability(#[from] readability::error::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
UrlParse(#[from] url::ParseError),
|
||||
|
||||
#[error("Unsupported source language")]
|
||||
UnsupportedLanguage,
|
||||
|
||||
#[error("Error: {0}")]
|
||||
OtherError(String),
|
||||
Other(String),
|
||||
}
|
||||
|
||||
pub type LoaderResult<T> = Result<T, LoaderError>;
|
||||
@@ -24,7 +24,7 @@ impl DocxLoader {
|
||||
}
|
||||
|
||||
impl Loader for DocxLoader {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError> {
|
||||
fn load(self) -> LoaderResult<Vec<Document>> {
|
||||
let doc = self.extract_text_to_doc();
|
||||
Ok(vec![doc])
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ impl<R: Read> HtmlLoader<R> {
|
||||
}
|
||||
|
||||
impl<R: Read + Send + Sync + 'static> Loader for HtmlLoader<R> {
|
||||
fn load(mut self) -> Result<Vec<Document>, LoaderError> {
|
||||
fn load(mut self) -> LoaderResult<Vec<Document>> {
|
||||
let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?;
|
||||
let doc =
|
||||
Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata(
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
mod docx;
|
||||
mod error;
|
||||
mod html;
|
||||
mod pdf;
|
||||
mod source;
|
||||
@@ -11,11 +10,8 @@ use super::*;
|
||||
|
||||
// modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders
|
||||
pub trait Loader: Send + Sync {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError>;
|
||||
fn load_and_split<TS: TextSplitter + 'static>(
|
||||
self,
|
||||
splitter: TS,
|
||||
) -> Result<Vec<Document>, LoaderError>
|
||||
fn load(self) -> LoaderResult<Vec<Document>>;
|
||||
fn load_and_split<TS: TextSplitter + 'static>(self, splitter: TS) -> LoaderResult<Vec<Document>>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
@@ -25,7 +21,6 @@ pub trait Loader: Send + Sync {
|
||||
}
|
||||
|
||||
pub use docx::DocxLoader;
|
||||
pub use error::{LoaderError, LoaderResult};
|
||||
pub use html::HtmlLoader;
|
||||
pub use pdf::PdfExtractLoader;
|
||||
pub use source::{get_language_by_filename, LanguageParserOptions, SourceCodeLoader};
|
||||
|
||||
@@ -12,8 +12,7 @@ pub struct PdfExtractLoader {
|
||||
|
||||
impl PdfExtractLoader {
|
||||
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
|
||||
let document = pdf_extract::Document::load_from(reader)
|
||||
.map_err(|e| LoaderError::OtherError(e.to_string()))?;
|
||||
let document = pdf_extract::Document::load_from(reader)?;
|
||||
Ok(Self { document })
|
||||
}
|
||||
}
|
||||
@@ -38,7 +37,7 @@ impl PdfExtractLoader {
|
||||
}
|
||||
|
||||
impl Loader for PdfExtractLoader {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError> {
|
||||
fn load(self) -> LoaderResult<Vec<Document>> {
|
||||
let doc = self.extract_text_to_doc()?;
|
||||
Ok(vec![doc])
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ impl SourceCodeLoader {
|
||||
}
|
||||
|
||||
impl Loader for SourceCodeLoader {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError> {
|
||||
fn load(self) -> LoaderResult<Vec<Document>> {
|
||||
let options = self.parser_option.clone();
|
||||
|
||||
let docs = LanguageParser::from_language(options.language)
|
||||
|
||||
@@ -17,7 +17,7 @@ impl TextLoader {
|
||||
}
|
||||
|
||||
impl Loader for TextLoader {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError> {
|
||||
fn load(self) -> LoaderResult<Vec<Document>> {
|
||||
let doc = Document::new(self.content);
|
||||
Ok(vec![doc])
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
mod document;
|
||||
mod error;
|
||||
mod loader;
|
||||
mod splitter;
|
||||
mod types;
|
||||
|
||||
pub use document::{Chunk, Doc};
|
||||
pub use error::{LoaderError, LoaderResult};
|
||||
use loader::{
|
||||
get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader, LoaderError,
|
||||
get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader,
|
||||
PdfExtractLoader, SourceCodeLoader, TextLoader, Url,
|
||||
};
|
||||
use splitter::{MarkdownSplitter, TextSplitter, TextSplitterError, TokenSplitter};
|
||||
|
||||
Reference in New Issue
Block a user