mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 12:55:00 +00:00
feat(native): doc loader for common native (#9941)
This commit is contained in:
70
packages/common/native/src/doc_loader/loader/pdf.rs
Normal file
70
packages/common/native/src/doc_loader/loader/pdf.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
use pdf_extract::{output_doc, output_doc_encrypted, PlainTextOutput};
|
||||
|
||||
/**
|
||||
* modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders
|
||||
*/
|
||||
use super::*;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PdfExtractLoader {
|
||||
document: pdf_extract::Document,
|
||||
}
|
||||
|
||||
impl PdfExtractLoader {
|
||||
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
|
||||
let document = pdf_extract::Document::load_from(reader)
|
||||
.map_err(|e| LoaderError::OtherError(e.to_string()))?;
|
||||
Ok(Self { document })
|
||||
}
|
||||
}
|
||||
|
||||
impl PdfExtractLoader {
|
||||
fn extract_text(&self) -> Result<String, LoaderError> {
|
||||
let mut doc = self.document.clone();
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
let mut output = PlainTextOutput::new(&mut buffer as &mut dyn std::io::Write);
|
||||
if doc.is_encrypted() {
|
||||
output_doc_encrypted(&mut doc, &mut output, "")?;
|
||||
} else {
|
||||
output_doc(&doc, &mut output)?;
|
||||
}
|
||||
Ok(String::from_utf8(buffer)?)
|
||||
}
|
||||
|
||||
fn extract_text_to_doc(&self) -> Result<Document, LoaderError> {
|
||||
let text = self.extract_text()?;
|
||||
Ok(Document::new(text))
|
||||
}
|
||||
}
|
||||
|
||||
impl Loader for PdfExtractLoader {
|
||||
fn load(self) -> Result<Vec<Document>, LoaderError> {
|
||||
let doc = self.extract_text_to_doc()?;
|
||||
Ok(vec![doc])
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{fs::read, io::Cursor, path::PathBuf};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_pdf() {
|
||||
let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures");
|
||||
let buffer = read(fixtures.join("sample.pdf")).unwrap();
|
||||
|
||||
let reader = Cursor::new(buffer);
|
||||
let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader");
|
||||
|
||||
let docs = loader.load().unwrap();
|
||||
|
||||
assert_eq!(docs.len(), 1);
|
||||
assert_eq!(
|
||||
&docs[0].page_content[..100],
|
||||
"\n\nSample PDF\nThis is a simple PDF file. Fun fun fun.\n\nLorem ipsum dolor sit amet, \
|
||||
consectetuer a"
|
||||
);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user