use pdf_extract::{output_doc, output_doc_encrypted, PlainTextOutput}; /** * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders */ use super::*; #[derive(Debug, Clone)] pub struct PdfExtractLoader { document: pdf_extract::Document, } impl PdfExtractLoader { pub fn new(reader: R) -> Result { let document = pdf_extract::Document::load_from(reader) .map_err(|e| LoaderError::OtherError(e.to_string()))?; Ok(Self { document }) } } impl PdfExtractLoader { fn extract_text(&self) -> Result { let mut doc = self.document.clone(); let mut buffer: Vec = Vec::new(); let mut output = PlainTextOutput::new(&mut buffer as &mut dyn std::io::Write); if doc.is_encrypted() { output_doc_encrypted(&mut doc, &mut output, "")?; } else { output_doc(&doc, &mut output)?; } Ok(String::from_utf8(buffer)?) } fn extract_text_to_doc(&self) -> Result { let text = self.extract_text()?; Ok(Document::new(text)) } } impl Loader for PdfExtractLoader { fn load(self) -> Result, LoaderError> { let doc = self.extract_text_to_doc()?; Ok(vec![doc]) } } #[cfg(test)] mod tests { use std::{fs::read, io::Cursor, path::PathBuf}; use super::*; #[test] fn test_parse_pdf() { let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures"); let buffer = read(fixtures.join("sample.pdf")).unwrap(); let reader = Cursor::new(buffer); let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader"); let docs = loader.load().unwrap(); assert_eq!(docs.len(), 1); assert_eq!( &docs[0].page_content[..100], "\n\nSample PDF\nThis is a simple PDF file. Fun fun fun.\n\nLorem ipsum dolor sit amet, \ consectetuer a" ); } }