diff --git a/Cargo.lock b/Cargo.lock index e1ea9467dc..4349628be8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,17 @@ dependencies = [ "pom", ] +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "affine_common" version = "0.1.0" @@ -477,6 +488,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + [[package]] name = "block2" version = "0.6.0" @@ -518,6 +538,12 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + [[package]] name = "bytemuck" version = "1.22.0" @@ -574,6 +600,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + [[package]] name = "cc" version = "1.2.16" @@ -589,7 +624,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -671,6 +706,16 @@ dependencies = [ "half", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -1015,15 +1060,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - [[package]] name = "derive_arbitrary" version = "1.4.1" @@ -1146,6 +1182,15 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7454e41ff9012c00d53cf7f475c5e3afa3b91b7c90568495495e8d9bf47a1055" +[[package]] +name = "ecb" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7" +dependencies = [ + "cipher", +] + [[package]] name = "either" version = "1.15.0" @@ -1825,6 +1870,16 @@ dependencies = [ "cfb", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + [[package]] name = "io-surface" version = "0.16.0" @@ -2015,19 +2070,27 @@ dependencies = [ [[package]] name = "lopdf" -version = "0.34.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7" dependencies = [ + "aes", + "bitflags 2.9.0", + "cbc", + "ecb", "encoding_rs", "flate2", "indexmap", "itoa", "log", "md-5", - "nom", + "nom 8.0.0", + "nom_locate", + "rand 0.9.0", "rangemap", - "time", + "sha2", + "stringprep", + "thiserror 2.0.12", "weezl", ] @@ -2263,6 +2326,26 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom_locate" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" +dependencies = [ + "bytecount", + "memchr", + "nom 8.0.0", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -2299,12 +2382,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - [[package]] name = "num-integer" version = "0.1.46" @@ -2513,13 +2590,13 @@ checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42" [[package]] name = "pdf-extract" version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87aa267a18864f2f75471f6d316ea430f13e78f0b5a882ce261ebbdfd389a76a" +source = "git+https://github.com/toeverything/pdf-extract#49ef7d2aec5bb495467a40082cd9717e849ee29a" dependencies = [ "adobe-cmap-parser", "cff-parser", "encoding_rs", "euclid", + "log", "lopdf", "postscript", "type1-encoding-parser", @@ -2655,12 +2732,6 @@ version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - [[package]] name = "ppv-lite86" version = "0.2.20" @@ -2949,7 +3020,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f9a866e2e00a7a1fb27e46e9e324a6f7c0e7edc4543cae1d38f4e4a100c610" dependencies = [ "memchr", - "nom", + "nom 7.1.3", "serde", ] @@ -3986,37 +4057,6 @@ dependencies = [ "rustc-hash 1.1.0", ] -[[package]] -name = "time" -version = "0.3.39" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad298b01a40a23aac4580b67e3dbedb7cc8402f3592d7f49469de2ea4aecdd8" -dependencies = [ - "deranged", - "itoa", - "num-conv", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "765c97a5b985b7c11d7bc27fa927dc4fe6af3a6dfb021d28deb60d3bf51e76ef" - -[[package]] -name = "time-macros" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8093bc3e81c3bc5f7879de09619d06c9a5a5e45ca44dfeeb7225bae38005c5c" -dependencies = [ - "num-conv", - "time-core", -] - [[package]] name = "tinystr" version = "0.7.6" @@ -4662,7 +4702,7 @@ version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "998d2c24ec099a87daf9467808859f9d82b61f1d9c9701251aea037f514eae0e" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -5068,7 +5108,7 @@ dependencies = [ "log", "loom", "nanoid", - "nom", + "nom 7.1.3", "ordered-float", "rand 0.8.5", "rand_chacha 0.3.1", diff --git a/Cargo.toml b/Cargo.toml index 88348fcb81..b8f0cad8c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ objc2-foundation = "0.3" once_cell = "1" parking_lot = "0.12" path-ext = "0.1.1" -pdf-extract = "0.8.2" +pdf-extract = { git = "https://github.com/toeverything/pdf-extract" } rand = "0.9" rayon = "1.10" readability = { version = "0.3.0", default-features = false } diff --git a/packages/backend/server/src/plugins/copilot/context/job.ts b/packages/backend/server/src/plugins/copilot/context/job.ts index 759d5d9539..c0103010a3 100644 --- a/packages/backend/server/src/plugins/copilot/context/job.ts +++ b/packages/backend/server/src/plugins/copilot/context/job.ts @@ -10,6 +10,7 @@ import { Config, EventBus, JobQueue, + mapAnyError, OnEvent, OnJob, } from '../../../base'; @@ -160,10 +161,11 @@ export class CopilotContextDocJob implements OnModuleInit { chunkSize: total, }); } catch (e: any) { - this.logger.error( - `Failed to embed pending file: ${contextId}::${fileId}`, - e - ); + const error = mapAnyError(e); + error.log('CopilotJob', { + workspaceId, + fileId, + }); this.event.emit('workspace.file.embed.failed', { contextId, diff --git a/packages/common/native/src/doc_loader/document.rs b/packages/common/native/src/doc_loader/document.rs index d6c3166c92..84e29acc51 100644 --- a/packages/common/native/src/doc_loader/document.rs +++ b/packages/common/native/src/doc_loader/document.rs @@ -1,4 +1,8 @@ -use std::{io::Cursor, path::PathBuf}; +use std::{ + io::Cursor, + panic::{catch_unwind, AssertUnwindSafe}, + path::PathBuf, +}; use path_ext::PathExt; @@ -81,16 +85,28 @@ impl Doc { fn from_loader( file_path: &str, - loader: impl Loader, + loader: impl Loader + 'static, splitter: impl TextSplitter + 'static, ) -> Result { let name = file_path.to_string(); - let chunks = Self::get_chunks_from_loader(loader, splitter)?; + let chunks = catch_unwind(AssertUnwindSafe(|| { + Self::get_chunks_from_loader(loader, splitter) + })) + .map_err(|e| { + LoaderError::Other(match e.downcast::() { + Ok(v) => *v, + Err(e) => match e.downcast::<&str>() { + Ok(v) => v.to_string(), + _ => "Unknown Source of Error".to_owned(), + }, + }) + })??; + Ok(Self { name, chunks }) } fn get_chunks_from_loader( - loader: impl Loader, + loader: impl Loader + 'static, splitter: impl TextSplitter + 'static, ) -> Result, LoaderError> { let docs = loader.load_and_split(splitter)?;