diff --git a/Cargo.lock b/Cargo.lock index e440e0ebd2..759793e124 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,15 +17,48 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adobe-cmap-parser" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" +dependencies = [ + "pom", +] + [[package]] name = "affine_common" version = "0.1.0" dependencies = [ + "cc", "chrono", "criterion2", + "docx-parser", + "infer", + "path-ext", + "pdf-extract", "rand 0.9.0", "rayon", + "readability", + "serde_json", "sha3", + "strum_macros", + "text-splitter", + "thiserror 1.0.69", + "tiktoken-rs", + "tree-sitter", + "tree-sitter-c", + "tree-sitter-c-sharp", + "tree-sitter-cpp", + "tree-sitter-go", + "tree-sitter-java", + "tree-sitter-javascript", + "tree-sitter-kotlin-ng", + "tree-sitter-python", + "tree-sitter-rust", + "tree-sitter-scala", + "tree-sitter-typescript", + "url", ] [[package]] @@ -287,6 +320,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "auto_enums" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c170965892137a3a9aeb000b4524aa3cc022a310e709d848b6e1cdce4ab4781" +dependencies = [ + "derive_utils", + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -354,13 +399,13 @@ dependencies = [ "bitflags 2.8.0", "cexpr", "clang-sys", - "itertools", + "itertools 0.13.0", "proc-macro2", "quote", "regex", "rustc-hash 1.1.0", "shlex", - "syn", + "syn 2.0.98", ] [[package]] @@ -523,6 +568,23 @@ dependencies = [ "nom", ] +[[package]] +name = "cfb" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + +[[package]] +name = "cff-parser" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f5b6e9141c036f3ff4ce7b2f7e432b0f00dee416ddcd4f17741d189ddc2e9d" + [[package]] name = "cfg-if" version = "1.0.0" @@ -627,7 +689,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -658,6 +720,12 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "convert_case" version = "0.7.1" @@ -753,6 +821,15 @@ dependencies = [ "thiserror 2.0.11", ] +[[package]] +name = "core_maths" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30" +dependencies = [ + "libm", +] + [[package]] name = "coreaudio-rs" version = "0.12.1" @@ -797,6 +874,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "criterion2" version = "2.0.0" @@ -895,6 +981,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + [[package]] name = "derive_arbitrary" version = "1.4.1" @@ -903,7 +998,31 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", +] + +[[package]] +name = "derive_more" +version = "0.99.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" +dependencies = [ + "convert_case 0.4.0", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.98", +] + +[[package]] +name = "derive_utils" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccfae181bab5ab6c5478b2ccb69e4c68a02f8c3ec72f6616bfec9dbc599d2ee0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] @@ -944,7 +1063,32 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", +] + +[[package]] +name = "docx-parser" +version = "0.1.1" +source = "git+https://github.com/toeverything/docx-parser#278ba3eeb29bbf1ee7958b02436e4402af61859b" +dependencies = [ + "base64 0.22.1", + "clap", + "docx-rust", + "serde", + "serde_json", +] + +[[package]] +name = "docx-rust" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6a8e9647d314f66d467a705715111d045955949b5dbcf2dc7aee89e078af83d" +dependencies = [ + "derive_more", + "hard-xml", + "log", + "thiserror 1.0.69", + "zip", ] [[package]] @@ -962,6 +1106,15 @@ dependencies = [ "serde", ] +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -989,6 +1142,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + [[package]] name = "event-listener" version = "5.4.0" @@ -1034,6 +1196,16 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7ef3d5e8ae27277c8285ac43ed153158178ef0f79567f32024ca8140a0c7cd8" +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.1" @@ -1045,6 +1217,12 @@ dependencies = [ "spin", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.4" @@ -1069,7 +1247,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -1108,6 +1286,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -1259,6 +1447,31 @@ dependencies = [ "crunchy", ] +[[package]] +name = "hard-xml" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a344e0cef8802f37dc47f17c01a04354d3e66d9f6c8744108b0912f616efe266" +dependencies = [ + "hard-xml-derive", + "jetscii", + "lazy_static", + "memchr", + "xmlparser", +] + +[[package]] +name = "hard-xml-derive" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfae7cdfe23e50ea96929ccf1948d9ae1d8608353556461e5de247463d3a4f6" +dependencies = [ + "bitflags 2.8.0", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -1340,6 +1553,20 @@ dependencies = [ "windows 0.57.0", ] +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -1478,9 +1705,31 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] +[[package]] +name = "icu_segmenter" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a717725612346ffc2d7b42c94b820db6908048f39434504cb130e8b46256b0de" +dependencies = [ + "core_maths", + "displaydoc", + "icu_collections", + "icu_locid", + "icu_provider", + "icu_segmenter_data", + "utf8_iter", + "zerovec", +] + +[[package]] +name = "icu_segmenter_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f739ee737260d955e330bc83fdeaaf1631f7fb7ed218761d3c04bb13bb7d79df" + [[package]] name = "idna" version = "1.0.3" @@ -1512,6 +1761,15 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "infer" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7" +dependencies = [ + "cfb", +] + [[package]] name = "io-surface" version = "0.16.0" @@ -1540,12 +1798,27 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jetscii" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" + [[package]] name = "js-sys" version = "0.3.77" @@ -1612,7 +1885,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -1685,6 +1958,30 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "encoding_rs", + "flate2", + "indexmap", + "itoa", + "log", + "md-5", + "nom", + "rangemap", + "time", + "weezl", +] + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "malloc_buf" version = "0.0.6" @@ -1694,6 +1991,32 @@ dependencies = [ "libc", ] +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1807,11 +2130,11 @@ version = "3.0.0-alpha.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0f0b6f3f77925d8fd2030855af659ce428a7bb6e10e94852e226f509186ba7c" dependencies = [ - "convert_case", + "convert_case 0.7.1", "napi-derive-backend", "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -1820,11 +2143,11 @@ version = "2.0.0-alpha.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c694bb49a2fa84dd9542d51eece39a57519f9cf1fc2deefa9d119ab8181e374d" dependencies = [ - "convert_case", + "convert_case 0.7.1", "proc-macro2", "quote", "semver", - "syn", + "syn 2.0.98", ] [[package]] @@ -1836,6 +2159,12 @@ dependencies = [ "libloading", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "nix" version = "0.29.0" @@ -1894,6 +2223,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.46" @@ -1924,6 +2259,27 @@ dependencies = [ "libm", ] +[[package]] +name = "num_enum" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "objc" version = "0.2.7" @@ -2062,6 +2418,38 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "path-ext" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de7a86239a8b87b5094977b64893fcf0ed768072744dd4ee0df237686b2d815" +dependencies = [ + "path-slash", + "walkdir", +] + +[[package]] +name = "path-slash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42" + +[[package]] +name = "pdf-extract" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87aa267a18864f2f75471f6d316ea430f13e78f0b5a882ce261ebbdfd389a76a" +dependencies = [ + "adobe-cmap-parser", + "cff-parser", + "encoding_rs", + "euclid", + "lopdf", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -2077,6 +2465,63 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher 1.0.1", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -2122,6 +2567,24 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -2131,6 +2594,12 @@ dependencies = [ "zerocopy 0.7.35", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "primal-check" version = "0.3.4" @@ -2140,6 +2609,15 @@ dependencies = [ "num-integer", ] +[[package]] +name = "proc-macro-crate" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.93" @@ -2149,6 +2627,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86ba2052aebccc42cbbb3ed234b8b13ce76f75c3551a303cb2bcffcff12bb14" +dependencies = [ + "bitflags 2.8.0", + "memchr", + "unicase", +] + [[package]] name = "quote" version = "1.0.38" @@ -2235,6 +2724,12 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "rangemap" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" + [[package]] name = "rayon" version = "1.10.0" @@ -2255,6 +2750,19 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "readability" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e56596e20a6d3cf715182d9b6829220621e6e985cec04d00410cee29821b4220" +dependencies = [ + "html5ever", + "lazy_static", + "markup5ever_rcdom", + "regex", + "url", +] + [[package]] name = "realfft" version = "3.4.0" @@ -2357,7 +2865,7 @@ dependencies = [ "rinja_parser", "rustc-hash 2.1.1", "serde", - "syn", + "syn 2.0.98", ] [[package]] @@ -2421,6 +2929,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustfft" version = "6.2.0" @@ -2555,7 +3072,7 @@ checksum = "7f81c2fde025af7e69b1d1420531c8a8811ca898919db177141a85313b1cb932" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -2584,7 +3101,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -2593,6 +3110,7 @@ version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ + "indexmap", "itoa", "memchr", "ryu", @@ -2683,6 +3201,12 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -2806,7 +3330,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn", + "syn 2.0.98", ] [[package]] @@ -2829,7 +3353,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn", + "syn 2.0.98", "tempfile", "tokio", "url", @@ -2952,12 +3476,43 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "strength_reduce" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82" +[[package]] +name = "string_cache" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938d512196766101d333398efde81bc1f37b00cb42c2f8350e5df639f040bbbe" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "244292f3441c89febe5b5bdfbb6863aeaf4f64da810ea3050fd927b27b8d92ce" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.5" @@ -2975,12 +3530,45 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.98", +] + [[package]] name = "subtle" version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.98" @@ -3000,7 +3588,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3023,6 +3611,36 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "text-splitter" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5cb76f2930deed7b89fd345fff5361813fb8feb7b6f0b80d26c4aba391819dd" +dependencies = [ + "ahash", + "auto_enums", + "either", + "icu_provider", + "icu_segmenter", + "itertools 0.14.0", + "pulldown-cmark", + "regex", + "strum", + "thiserror 2.0.11", + "tiktoken-rs", +] + [[package]] name = "textwrap" version = "0.16.1" @@ -3058,7 +3676,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3069,7 +3687,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3098,6 +3716,37 @@ dependencies = [ "rustc-hash 1.1.0", ] +[[package]] +name = "time" +version = "0.3.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "tinystr" version = "0.7.6" @@ -3149,7 +3798,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3172,6 +3821,23 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" + +[[package]] +name = "toml_edit" +version = "0.22.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + [[package]] name = "tracing" version = "0.1.41" @@ -3192,7 +3858,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3244,6 +3910,145 @@ dependencies = [ "strength_reduce", ] +[[package]] +name = "tree-sitter" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a802c93485fb6781d27e27cb5927f6b00ff8d26b56c70af87267be7e99def97" +dependencies = [ + "cc", + "regex", + "regex-syntax 0.8.5", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afd2b1bf1585dc2ef6d69e87d01db8adb059006649dd5f96f31aa789ee6e9c71" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67f06accca7b45351758663b8215089e643d53bd9a660ce0349314263737fcb0" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-go" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b13d476345220dbe600147dd444165c5791bf85ef53e28acbedd46112ee18431" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf40bf599e0416c16c125c3cec10ee5ddc7d1bb8b0c60fa5c4de249ad34dc1b1" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-kotlin-ng" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e800ebbda938acfbf224f4d2c34947a31994b1295ee6e819b65226c7b51b4450" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38eee4db33814de3d004de9d8d825627ed3320d0989cce0dea30efaf5be4736c" + +[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d64d449ca63e683c562c7743946a646671ca23947b9c925c0cfbe65051a4af" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-scala" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efde5e68b4736e9eac17bfa296c6f104a26bffab363b365eb898c40a63c15d2f" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "type1-encoding-parser" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" +dependencies = [ + "pom", +] + [[package]] name = "typenum" version = "1.17.0" @@ -3360,7 +4165,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22a9dba1d78b9ce429439891089c223478043d52a1c3176a0fcea2b5573a7fcf" dependencies = [ "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3375,7 +4180,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn", + "syn 2.0.98", "toml", "uniffi_meta", ] @@ -3387,7 +4192,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d5965b1d4ffacef1eaa72fef9c00d2491641e87ad910f6c5859b9c503ddb16a" dependencies = [ "anyhow", - "siphasher", + "siphasher 0.3.11", "uniffi_internal_macros", ] @@ -3420,6 +4225,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf16_iter" version = "1.0.5" @@ -3530,7 +4341,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn", + "syn 2.0.98", "wasm-bindgen-shared", ] @@ -3552,7 +4363,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3584,6 +4395,12 @@ dependencies = [ "nom", ] +[[package]] +name = "weezl" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" + [[package]] name = "whoami" version = "1.5.2" @@ -3693,7 +4510,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3704,7 +4521,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3715,7 +4532,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3726,7 +4543,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -3905,6 +4722,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen-rt" version = "0.33.0" @@ -3935,6 +4761,23 @@ dependencies = [ "tap", ] +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + [[package]] name = "y-octo" version = "0.0.1" @@ -3980,7 +4823,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", "synstructure", ] @@ -4011,7 +4854,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -4022,7 +4865,7 @@ checksum = "5226bc9a9a9836e7428936cde76bb6b22feea1a8bfdbc0d241136e4d13417e25" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", ] [[package]] @@ -4042,7 +4885,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", "synstructure", ] @@ -4071,5 +4914,21 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.98", +] + +[[package]] +name = "zip" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cc23c04387f4da0374be4533ad1208cbb091d5c11d070dfef13676ad6497164" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap", + "num_enum", + "thiserror 1.0.69", ] diff --git a/packages/common/native/Cargo.toml b/packages/common/native/Cargo.toml index 5d1ad3e3eb..dd3992ca07 100644 --- a/packages/common/native/Cargo.toml +++ b/packages/common/native/Cargo.toml @@ -3,15 +3,63 @@ edition = "2021" name = "affine_common" version = "0.1.0" +[features] +default = [] +doc-loader = ["docx-parser", "infer", "path-ext", "pdf-extract", "readability", "serde_json", "strum_macros", "text-splitter", "thiserror", "tree-sitter", "url"] +tree-sitter = [ + "cc", + "dep:tree-sitter", + "dep:tree-sitter-c", + "dep:tree-sitter-c-sharp", + "dep:tree-sitter-cpp", + "dep:tree-sitter-go", + "dep:tree-sitter-java", + "dep:tree-sitter-javascript", + "dep:tree-sitter-kotlin-ng", + "dep:tree-sitter-python", + "dep:tree-sitter-rust", + "dep:tree-sitter-scala", + "dep:tree-sitter-typescript", +] + [dependencies] chrono = { workspace = true } rand = { workspace = true } sha3 = { workspace = true } +docx-parser = { git = "https://github.com/toeverything/docx-parser", optional = true } +infer = { version = "0.19.0", optional = true } +path-ext = { version = "0.1.1", optional = true } +pdf-extract = { version = "0.8.2", optional = true } +readability = { version = "0.3.0", optional = true, default-features = false } +serde_json = { version = "1.0", optional = true } +strum_macros = { version = "0.26.2", optional = true } +text-splitter = { version = "0.22", features = ["markdown", "tiktoken-rs"], optional = true } +thiserror = { version = "1", optional = true } +tree-sitter = { version = "0.25", optional = true } +tree-sitter-c = { version = "0.23", optional = true } +tree-sitter-c-sharp = { version = "0.23", optional = true } +tree-sitter-cpp = { version = "0.23", optional = true } +tree-sitter-go = { version = "0.23", optional = true } +tree-sitter-java = { version = "0.23", optional = true } +tree-sitter-javascript = { version = "0.23", optional = true } +tree-sitter-kotlin-ng = { version = "1.1", optional = true } +tree-sitter-python = { version = "0.23", optional = true } +tree-sitter-rust = { version = "0.23", optional = true } +tree-sitter-scala = { version = "0.23", optional = true } +tree-sitter-typescript = { version = "0.23", optional = true } +url = { version = "2.5", optional = true } + + +tiktoken-rs = { workspace = true } + [dev-dependencies] criterion2 = { workspace = true } rayon = { workspace = true } +[build-dependencies] +cc = { version = "1", optional = true } + [[bench]] harness = false name = "hashcash" diff --git a/packages/common/native/fixtures/demo.docx b/packages/common/native/fixtures/demo.docx new file mode 100644 index 0000000000..b172c4395d Binary files /dev/null and b/packages/common/native/fixtures/demo.docx differ diff --git a/packages/common/native/fixtures/demo.docx.0.md b/packages/common/native/fixtures/demo.docx.0.md new file mode 100644 index 0000000000..84eaae7b9e --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.0.md @@ -0,0 +1,28 @@ +# DOCX Demo + +# Demonstration of DOCX support in calibre + +This document demonstrates the ability of the calibre DOCX Input plugin to convert the various typographic features in a Microsoft Word (2007 and newer) document. Convert this document to a modern ebook format, such as AZW3 for Kindles or EPUB for other ebook readers, to see it in action. + +There is support for images, tables, lists, footnotes, endnotes, links, dropcaps and various types of text and paragraph level formatting. + +To see the DOCX conversion in action, simply add this file to calibre using the **“Add Books” **button and then click “**Convert”. ** Set the output format in the top right corner of the conversion dialog to EPUB or AZW3 and click **“OK”**. + +# Text Formatting + +## Inline formatting + +Here, we demonstrate various types of inline text formatting and the use of embedded fonts. + +Here is some **bold, ***italic, ****bold-italic, ***__underlined __and ~~struck out ~~ text. Then, we have a superscript and a subscript. Now we see some red, green and blue text. Some text with a yellow highlight. Some text in a box. Some text in inverse video. + +A paragraph with styled text: subtle emphasis followed by strong text and intense emphasis. This paragraph uses document wide styles for styling rather than inline text properties as demonstrated in the previous paragraph — calibre can handle both with equal ease. + +## Fun with fonts + +This document has embedded the Ubuntu font family. The body text is in the Ubuntu typeface, here is some text in the Ubuntu Mono typeface, notice how every letter has the same width, even i and m. Every embedded font will automatically be embedded in the output ebook during conversion. + +## ***************Paragraph level formatting*** + +You can do crazy things with paragraphs, if the urge strikes you. For instance this paragraph is right aligned and has a right border. It has also been given a light gray background. + diff --git a/packages/common/native/fixtures/demo.docx.1.md b/packages/common/native/fixtures/demo.docx.1.md new file mode 100644 index 0000000000..6fb956d575 --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.1.md @@ -0,0 +1,28 @@ +For the lovers of poetry amongst you, paragraphs with hanging indents, like this often come in handy. You can use hanging indents to ensure that a line of poetry retains its individual identity as a line even when the screen is too narrow to display it as a single line. Not only does this paragraph have a hanging indent, it is also has an extra top margin, setting it apart from the preceding paragraph. + +# Tables + +| | | +| ----------- | -------- | +| ITEM | NEEDED | +| Books | 1 | +| Pens | 3 | +| Pencils | 2 | +| Highlighter | 2 colors | +| Scissors | 1 pair | + +Tables in Word can vary from the extremely simple to the extremely complex. calibre tries to do its best when converting tables. While you may run into trouble with the occasional table, the vast majority of common cases should be converted very well, as demonstrated in this section. Note that for optimum results, when creating tables in Word, you should set their widths using percentages, rather than absolute units. To the left of this paragraph is a floating two column table with a nice green border and header row. + +Now let’s look at a fancier table—one with alternating row colors and partial borders. This table is stretched out to take 100% of the available width. + +| | | | | | | +| ------------ | ------- | ------- | ------- | ------- | ------- | +| City or Town | Point A | Point B | Point C | Point D | Point E | +| Point A | — | | | | | +| Point B | 87 | — | | | | +| Point C | 64 | 56 | — | | | +| Point D | 37 | 32 | 91 | — | | +| Point E | 93 | 35 | 54 | 43 | — | + +Next, we see a table with special formatting in various locations. Notice how the formatting for the header row and sub header rows is preserved. + diff --git a/packages/common/native/fixtures/demo.docx.2.md b/packages/common/native/fixtures/demo.docx.2.md new file mode 100644 index 0000000000..c33981298c --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.2.md @@ -0,0 +1,21 @@ +| | | | | +| ---------------- | ------------- | ------------------- | ------ | +| College | New students | Graduating students | Change | +| | Undergraduate | | | +| Cedar University | 110 | 103 | +7 | +| Oak Institute | 202 | 210 | -8 | +| | Graduate | | | +| Cedar University | 24 | 20 | +4 | +| Elm College | 43 | 53 | -10 | +| Total | 998 | 908 | 90 | + +Source: Fictitious data, for illustration purposes only + +Next, we have something a little more complex, a nested table, i.e. a table inside another table. Additionally, the inner table has some of its cells merged. The table is displayed horizontally centered. + +| | | +| --- | -------------------------------------------------------------- | +| | To the left is a table inside a table, with some cells merged. | + +We end with a fancy calendar, note how much of the original formatting is preserved. Note that this table will only display correctly on relatively wide screens. In general, very wide tables or tables whose cells have fixed width requirements don’t fare well in ebooks. + diff --git a/packages/common/native/fixtures/demo.docx.3.md b/packages/common/native/fixtures/demo.docx.3.md new file mode 100644 index 0000000000..2ba6242d7d --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.3.md @@ -0,0 +1,18 @@ +| | | | | | | | | | | | | | +| ------------- | | --- | | --- | | --- | | --- | | --- | | --- | +| December 2007 | | | | | | | | | | | | | +| Sun | | Mon | | Tue | | Wed | | Thu | | Fri | | Sat | +| | | | | | | | | | | | | 1 | +| | | | | | | | | | | | | | +| 2 | | 3 | | 4 | | 5 | | 6 | | 7 | | 8 | +| | | | | | | | | | | | | | +| 9 | | 10 | | 11 | | 12 | | 13 | | 14 | | 15 | +| | | | | | | | | | | | | | +| 16 | | 17 | | 18 | | 19 | | 20 | | 21 | | 22 | +| | | | | | | | | | | | | | +| 23 | | 24 | | 25 | | 26 | | 27 | | 28 | | 29 | +| | | | | | | | | | | | | | +| 30 | | 31 | | | | | | | | | | | + +# Structural Elements + diff --git a/packages/common/native/fixtures/demo.docx.4.md b/packages/common/native/fixtures/demo.docx.4.md new file mode 100644 index 0000000000..b5ff3a8ddf --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.4.md @@ -0,0 +1,20 @@ +Miscellaneous structural elements you can add to your document, like footnotes, endnotes, dropcaps and the like. + +## Footnotes & Endnotes + +Footnotes and endnotes are automatically recognized and both are converted to endnotes, with backlinks for maximum ease of use in ebook devices. + +## Dropcaps + +D + +rop caps are used to emphasize the leading paragraph at the start of a section. In Word it is possible to specify how many lines of text a drop-cap should use. Because of limitations in ebook technology, this is not possible when converting. Instead, the converted drop cap will use font size and line height to simulate the effect as well as possible. While not as good as the original, the result is usually tolerable. This paragraph has a “D” dropcap set to occupy three lines of text with a font size of 58.5 pts. Depending on the screen width and capabilities of the device you view the book on, this dropcap can look anything from perfect to ugly. + +## Links + +Two kinds of links are possible, those that refer to an external website and those that refer to locations inside the document itself. Both are supported by calibre. For example, here is a link pointing to the [calibre download page](http://calibre-ebook.com/download). Then we have a link that points back to the section on [paragraph level formatting](#_Paragraph_level_formatting) in this document. + +## Table of Contents + +There are two approaches that calibre takes when generating a Table of Contents. The first is if the Word document has a Table of Contents itself. Provided that the Table of Contents uses hyperlinks, calibre will automatically use it. The levels of the Table of Contents are identified by their left indent, so if you want the ebook to have a multi-level Table of Contents, make sure you create a properly indented Table of Contents in Word. + diff --git a/packages/common/native/fixtures/demo.docx.5.md b/packages/common/native/fixtures/demo.docx.5.md new file mode 100644 index 0000000000..75081fee8a --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.5.md @@ -0,0 +1,30 @@ +If no Table of Contents is found in the document, then a table of contents is automatically generated from the headings in the document. A heading is identified as something that has the Heading 1 or Heading 2, etc. style applied to it. These headings are turned into a Table of Contents with Heading 1 being the topmost level, Heading 2 the second level and so on. + + You can see the Table of Contents created by calibre by clicking the Table of Contents button in whatever viewer you are using to view the converted ebook. + +# Images + +Images can be of three main types. Inline images are images that are part of the normal text flow, like this image of a green dot ![dot_green.png](./media/image2.png). Inline images do not cause breaks in the text and are usually small in size. The next category of image is a floating image, one that “floats “ on the page and is surrounded by text. Word supports more types of floating images than are possible with current ebook technology, so the conversion maps floating images to simple left and right floats, as you can see with the left and right arrow images on the sides of this paragraph. + +The final type of image is a “block” image, one that becomes a paragraph on its own and has no text on either side. Below is a centered green dot. + +Centered images like this are useful for large pictures that should be a focus of attention. + +Generally, it is not possible to translate the exact positioning of images from a Word document to an ebook. That is because in Word, image positioning is specified in absolute units from the page boundaries. There is no analogous technology in ebooks, so the conversion will usually end up placing the image either centered or floating close to the point in the text where it was inserted, not necessarily where it appears on the page in Word. + +# Lists + +All types of lists are supported by the conversion, with the exception of lists that use fancy bullets, these get converted to regular bullets. + +## Bulleted List + +- One + +- Two + +## Numbered List + +1. One, with a very long line to demonstrate that the hanging indent for the list is working correctly + +2. Two + diff --git a/packages/common/native/fixtures/demo.docx.6.md b/packages/common/native/fixtures/demo.docx.6.md new file mode 100644 index 0000000000..8c31cfaee1 --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.6.md @@ -0,0 +1,37 @@ +## Multi-level Lists + +1. One + + 2. Two + + 3. Three + + 4. Four with a very long line to demonstrate that the hanging indent for the list is working correctly. + + 5. Five + +6. Six + +A Multi-level list with bullets: + +- One + + - Two + + - This bullet uses an image as the bullet item + + - Four + +- Five + +## Continued Lists + +i. One + +j. Two + +An interruption in our regularly scheduled listing, for this essential and very relevant public service announcement. + +k. We now resume our normal programming + +l. Four diff --git a/packages/common/native/fixtures/demo.docx.md b/packages/common/native/fixtures/demo.docx.md new file mode 100644 index 0000000000..ed9a5de54c --- /dev/null +++ b/packages/common/native/fixtures/demo.docx.md @@ -0,0 +1,182 @@ +# DOCX Demo + +# Demonstration of DOCX support in calibre + +This document demonstrates the ability of the calibre DOCX Input plugin to convert the various typographic features in a Microsoft Word (2007 and newer) document. Convert this document to a modern ebook format, such as AZW3 for Kindles or EPUB for other ebook readers, to see it in action. + +There is support for images, tables, lists, footnotes, endnotes, links, dropcaps and various types of text and paragraph level formatting. + +To see the DOCX conversion in action, simply add this file to calibre using the **“Add Books” **button and then click “**Convert”. ** Set the output format in the top right corner of the conversion dialog to EPUB or AZW3 and click **“OK”**. + +# Text Formatting + +## Inline formatting + +Here, we demonstrate various types of inline text formatting and the use of embedded fonts. + +Here is some **bold, ***italic, ****bold-italic, ***__underlined __and ~~struck out ~~ text. Then, we have a superscript and a subscript. Now we see some red, green and blue text. Some text with a yellow highlight. Some text in a box. Some text in inverse video. + +A paragraph with styled text: subtle emphasis followed by strong text and intense emphasis. This paragraph uses document wide styles for styling rather than inline text properties as demonstrated in the previous paragraph — calibre can handle both with equal ease. + +## Fun with fonts + +This document has embedded the Ubuntu font family. The body text is in the Ubuntu typeface, here is some text in the Ubuntu Mono typeface, notice how every letter has the same width, even i and m. Every embedded font will automatically be embedded in the output ebook during conversion. + +## ***************Paragraph level formatting*** + +You can do crazy things with paragraphs, if the urge strikes you. For instance this paragraph is right aligned and has a right border. It has also been given a light gray background. + +For the lovers of poetry amongst you, paragraphs with hanging indents, like this often come in handy. You can use hanging indents to ensure that a line of poetry retains its individual identity as a line even when the screen is too narrow to display it as a single line. Not only does this paragraph have a hanging indent, it is also has an extra top margin, setting it apart from the preceding paragraph. + +# Tables + +| | | +| ----------- | -------- | +| ITEM | NEEDED | +| Books | 1 | +| Pens | 3 | +| Pencils | 2 | +| Highlighter | 2 colors | +| Scissors | 1 pair | + +Tables in Word can vary from the extremely simple to the extremely complex. calibre tries to do its best when converting tables. While you may run into trouble with the occasional table, the vast majority of common cases should be converted very well, as demonstrated in this section. Note that for optimum results, when creating tables in Word, you should set their widths using percentages, rather than absolute units. To the left of this paragraph is a floating two column table with a nice green border and header row. + +Now let’s look at a fancier table—one with alternating row colors and partial borders. This table is stretched out to take 100% of the available width. + +| | | | | | | +| ------------ | ------- | ------- | ------- | ------- | ------- | +| City or Town | Point A | Point B | Point C | Point D | Point E | +| Point A | — | | | | | +| Point B | 87 | — | | | | +| Point C | 64 | 56 | — | | | +| Point D | 37 | 32 | 91 | — | | +| Point E | 93 | 35 | 54 | 43 | — | + +Next, we see a table with special formatting in various locations. Notice how the formatting for the header row and sub header rows is preserved. + +| | | | | +| ---------------- | ------------- | ------------------- | ------ | +| College | New students | Graduating students | Change | +| | Undergraduate | | | +| Cedar University | 110 | 103 | +7 | +| Oak Institute | 202 | 210 | -8 | +| | Graduate | | | +| Cedar University | 24 | 20 | +4 | +| Elm College | 43 | 53 | -10 | +| Total | 998 | 908 | 90 | + +Source: Fictitious data, for illustration purposes only + +Next, we have something a little more complex, a nested table, i.e. a table inside another table. Additionally, the inner table has some of its cells merged. The table is displayed horizontally centered. + +| | | +| --- | -------------------------------------------------------------- | +| | To the left is a table inside a table, with some cells merged. | + +We end with a fancy calendar, note how much of the original formatting is preserved. Note that this table will only display correctly on relatively wide screens. In general, very wide tables or tables whose cells have fixed width requirements don’t fare well in ebooks. + +| | | | | | | | | | | | | | +| ------------- | | --- | | --- | | --- | | --- | | --- | | --- | +| December 2007 | | | | | | | | | | | | | +| Sun | | Mon | | Tue | | Wed | | Thu | | Fri | | Sat | +| | | | | | | | | | | | | 1 | +| | | | | | | | | | | | | | +| 2 | | 3 | | 4 | | 5 | | 6 | | 7 | | 8 | +| | | | | | | | | | | | | | +| 9 | | 10 | | 11 | | 12 | | 13 | | 14 | | 15 | +| | | | | | | | | | | | | | +| 16 | | 17 | | 18 | | 19 | | 20 | | 21 | | 22 | +| | | | | | | | | | | | | | +| 23 | | 24 | | 25 | | 26 | | 27 | | 28 | | 29 | +| | | | | | | | | | | | | | +| 30 | | 31 | | | | | | | | | | | + +# Structural Elements + +Miscellaneous structural elements you can add to your document, like footnotes, endnotes, dropcaps and the like. + +## Footnotes & Endnotes + +Footnotes and endnotes are automatically recognized and both are converted to endnotes, with backlinks for maximum ease of use in ebook devices. + +## Dropcaps + +D + +rop caps are used to emphasize the leading paragraph at the start of a section. In Word it is possible to specify how many lines of text a drop-cap should use. Because of limitations in ebook technology, this is not possible when converting. Instead, the converted drop cap will use font size and line height to simulate the effect as well as possible. While not as good as the original, the result is usually tolerable. This paragraph has a “D” dropcap set to occupy three lines of text with a font size of 58.5 pts. Depending on the screen width and capabilities of the device you view the book on, this dropcap can look anything from perfect to ugly. + +## Links + +Two kinds of links are possible, those that refer to an external website and those that refer to locations inside the document itself. Both are supported by calibre. For example, here is a link pointing to the [calibre download page](http://calibre-ebook.com/download). Then we have a link that points back to the section on [paragraph level formatting](#_Paragraph_level_formatting) in this document. + +## Table of Contents + +There are two approaches that calibre takes when generating a Table of Contents. The first is if the Word document has a Table of Contents itself. Provided that the Table of Contents uses hyperlinks, calibre will automatically use it. The levels of the Table of Contents are identified by their left indent, so if you want the ebook to have a multi-level Table of Contents, make sure you create a properly indented Table of Contents in Word. + +If no Table of Contents is found in the document, then a table of contents is automatically generated from the headings in the document. A heading is identified as something that has the Heading 1 or Heading 2, etc. style applied to it. These headings are turned into a Table of Contents with Heading 1 being the topmost level, Heading 2 the second level and so on. + + You can see the Table of Contents created by calibre by clicking the Table of Contents button in whatever viewer you are using to view the converted ebook. + +# Images + +Images can be of three main types. Inline images are images that are part of the normal text flow, like this image of a green dot ![dot_green.png](./media/image2.png). Inline images do not cause breaks in the text and are usually small in size. The next category of image is a floating image, one that “floats “ on the page and is surrounded by text. Word supports more types of floating images than are possible with current ebook technology, so the conversion maps floating images to simple left and right floats, as you can see with the left and right arrow images on the sides of this paragraph. + +The final type of image is a “block” image, one that becomes a paragraph on its own and has no text on either side. Below is a centered green dot. + +Centered images like this are useful for large pictures that should be a focus of attention. + +Generally, it is not possible to translate the exact positioning of images from a Word document to an ebook. That is because in Word, image positioning is specified in absolute units from the page boundaries. There is no analogous technology in ebooks, so the conversion will usually end up placing the image either centered or floating close to the point in the text where it was inserted, not necessarily where it appears on the page in Word. + +# Lists + +All types of lists are supported by the conversion, with the exception of lists that use fancy bullets, these get converted to regular bullets. + +## Bulleted List + +- One + +- Two + +## Numbered List + +1. One, with a very long line to demonstrate that the hanging indent for the list is working correctly + +2. Two + +## Multi-level Lists + +1. One + + 2. Two + + 3. Three + + 4. Four with a very long line to demonstrate that the hanging indent for the list is working correctly. + + 5. Five + +6. Six + +A Multi-level list with bullets: + +- One + + - Two + + - This bullet uses an image as the bullet item + + - Four + +- Five + +## Continued Lists + +i. One + +j. Two + +An interruption in our regularly scheduled listing, for this essential and very relevant public service announcement. + +k. We now resume our normal programming + +l. Four diff --git a/packages/common/native/fixtures/sample.c b/packages/common/native/fixtures/sample.c new file mode 100644 index 0000000000..77c595c915 --- /dev/null +++ b/packages/common/native/fixtures/sample.c @@ -0,0 +1,7 @@ + +#include + +int main() { + printf("Hello, World!\n"); + return 0; +} \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.c.0.md b/packages/common/native/fixtures/sample.c.0.md new file mode 100644 index 0000000000..53c5fdf179 --- /dev/null +++ b/packages/common/native/fixtures/sample.c.0.md @@ -0,0 +1 @@ +#include diff --git a/packages/common/native/fixtures/sample.c.1.md b/packages/common/native/fixtures/sample.c.1.md new file mode 100644 index 0000000000..ff6b43ab59 --- /dev/null +++ b/packages/common/native/fixtures/sample.c.1.md @@ -0,0 +1,4 @@ +int main() { + printf("Hello, World!\n"); + return 0; +} \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.html b/packages/common/native/fixtures/sample.html new file mode 100644 index 0000000000..1a8b38f01d --- /dev/null +++ b/packages/common/native/fixtures/sample.html @@ -0,0 +1,481 @@ + + + + + + Example Domain + + + + + + +
+

Example Domain

+

+ This domain is for use in illustrative examples in documents. You may + use this domain in literature without prior coordination or asking for + permission. +

+

+ More information... +

+
+ + + diff --git a/packages/common/native/fixtures/sample.html.0.md b/packages/common/native/fixtures/sample.html.0.md new file mode 100644 index 0000000000..b722328f91 --- /dev/null +++ b/packages/common/native/fixtures/sample.html.0.md @@ -0,0 +1,6 @@ +Example Domain + + This domain is for use in illustrative examples in documents. You may + use this domain in literature without prior coordination or asking for + permission. + More information... \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.pdf b/packages/common/native/fixtures/sample.pdf new file mode 100644 index 0000000000..c01805e89c Binary files /dev/null and b/packages/common/native/fixtures/sample.pdf differ diff --git a/packages/common/native/fixtures/sample.pdf.0.md b/packages/common/native/fixtures/sample.pdf.0.md new file mode 100644 index 0000000000..f35c3d5d5e --- /dev/null +++ b/packages/common/native/fixtures/sample.pdf.0.md @@ -0,0 +1,17 @@ + + +Sample PDF +This is a simple PDF file. Fun fun fun. + +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Phasellus facilisis odio sed mi. +Curabitur suscipit. Nullam vel nisi. Etiam semper ipsum ut lectus. Proin aliquam, erat eget +pharetra commodo, eros mi condimentum quam, sed commodo justo quam ut velit. +Integer a erat. Cras laoreet ligula cursus enim. Aenean scelerisque velit et tellus. +Vestibulum dictum aliquet sem. Nulla facilisi. Vestibulum accumsan ante vitae elit. Nulla +erat dolor, blandit in, rutrum quis, semper pulvinar, enim. Nullam varius congue risus. +Vivamus sollicitudin, metus ut interdum eleifend, nisi tellus pellentesque elit, tristique +accumsan eros quam et risus. Suspendisse libero odio, mattis sit amet, aliquet eget, +hendrerit vel, nulla. Sed vitae augue. Aliquam erat volutpat. Aliquam feugiat vulputate nisl. +Suspendisse quis nulla pretium ante pretium mollis. Proin velit ligula, sagittis at, egestas a, +pulvinar quis, nisl. + diff --git a/packages/common/native/fixtures/sample.pdf.1.md b/packages/common/native/fixtures/sample.pdf.1.md new file mode 100644 index 0000000000..3bb84cb162 --- /dev/null +++ b/packages/common/native/fixtures/sample.pdf.1.md @@ -0,0 +1,9 @@ +Pellentesque sit amet lectus. Praesent pulvinar, nunc quis iaculis sagittis, justo quam +lobortis tortor, sed vestibulum dui metus venenatis est. Nunc cursus ligula. Nulla facilisi. +Phasellus ullamcorper consectetuer ante. Duis tincidunt, urna id condimentum luctus, nibh +ante vulputate sapien, id sagittis massa orci ut enim. Pellentesque vestibulum convallis +sem. Nulla consequat quam ut nisl. Nullam est. Curabitur tincidunt dapibus lorem. Proin +velit turpis, scelerisque sit amet, iaculis nec, rhoncus ac, ipsum. Phasellus lorem arcu, +feugiat eu, gravida eu, consequat molestie, ipsum. Nullam vel est ut ipsum volutpat +feugiat. Aenean pellentesque. + diff --git a/packages/common/native/fixtures/sample.pdf.2.md b/packages/common/native/fixtures/sample.pdf.2.md new file mode 100644 index 0000000000..bc1e6a2638 --- /dev/null +++ b/packages/common/native/fixtures/sample.pdf.2.md @@ -0,0 +1,16 @@ +In mauris. Pellentesque dui nisi, iaculis eu, rhoncus in, venenatis ac, ante. Ut odio justo, +scelerisque vel, facilisis non, commodo a, pede. Cras nec massa sit amet tortor volutpat +varius. Donec lacinia, neque a luctus aliquet, pede massa imperdiet ante, at varius lorem +pede sed sapien. Fusce erat nibh, aliquet in, eleifend eget, commodo eget, erat. Fusce +consectetuer. Cras risus tortor, porttitor nec, tristique sed, convallis semper, eros. Fusce +vulputate ipsum a mauris. Phasellus mollis. Curabitur sed urna. Aliquam nec sapien non +nibh pulvinar convallis. Vivamus facilisis augue quis quam. Proin cursus aliquet metus. +Suspendisse lacinia. Nulla at tellus ac turpis eleifend scelerisque. Maecenas a pede vitae +enim commodo interdum. Donec odio. Sed sollicitudin dui vitae justo. + +Morbi elit nunc, facilisis a, mollis a, molestie at, lectus. Suspendisse eget mauris eu tellus +molestie cursus. Duis ut magna at justo dignissim condimentum. Cum sociis natoque +penatibus et magnis dis parturient montes, nascetur ridiculus mus. Vivamus varius. Ut sit +amet diam suscipit mauris ornare aliquam. Sed varius. Duis arcu. Etiam tristique massa +eget dui. Phasellus congue. Aenean est erat, tincidunt eget, venenatis quis, commodo at, +quam. \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.rs b/packages/common/native/fixtures/sample.rs new file mode 100644 index 0000000000..20c7697ff9 --- /dev/null +++ b/packages/common/native/fixtures/sample.rs @@ -0,0 +1,10 @@ +fn factorial(n: u64) -> u64 { + if n == 0 { + return 1; + } + n * factorial(n - 1) +} + +fn main() { + println!("Hello, world!"); +} diff --git a/packages/common/native/fixtures/sample.rs.0.md b/packages/common/native/fixtures/sample.rs.0.md new file mode 100644 index 0000000000..52e6433529 --- /dev/null +++ b/packages/common/native/fixtures/sample.rs.0.md @@ -0,0 +1,6 @@ +fn factorial(n: u64) -> u64 { + if n == 0 { + return 1; + } + n * factorial(n - 1) +} \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.rs.1.md b/packages/common/native/fixtures/sample.rs.1.md new file mode 100644 index 0000000000..0309b94864 --- /dev/null +++ b/packages/common/native/fixtures/sample.rs.1.md @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} \ No newline at end of file diff --git a/packages/common/native/fixtures/sample.ts b/packages/common/native/fixtures/sample.ts new file mode 100644 index 0000000000..0186fe8f8c --- /dev/null +++ b/packages/common/native/fixtures/sample.ts @@ -0,0 +1,3 @@ +export default function sample() { + return 'sample'; +} diff --git a/packages/common/native/fixtures/sample.ts.0.md b/packages/common/native/fixtures/sample.ts.0.md new file mode 100644 index 0000000000..5887c490aa --- /dev/null +++ b/packages/common/native/fixtures/sample.ts.0.md @@ -0,0 +1,3 @@ +export default function sample() { + return 'sample'; +} \ No newline at end of file diff --git a/packages/common/native/src/doc_loader/document.rs b/packages/common/native/src/doc_loader/document.rs new file mode 100644 index 0000000000..881767d4b9 --- /dev/null +++ b/packages/common/native/src/doc_loader/document.rs @@ -0,0 +1,169 @@ +use std::{io::Cursor, path::PathBuf}; + +use path_ext::PathExt; + +use super::*; + +#[derive(Clone, Default)] +pub struct Chunk { + pub index: usize, + pub content: String, + pub start: Option, + pub end: Option, +} + +pub struct DocOptions { + code_threshold: u64, +} + +impl Default for DocOptions { + fn default() -> Self { + Self { + code_threshold: 1000, + } + } +} + +pub struct Doc { + pub name: String, + pub chunks: Vec, +} + +impl Doc { + pub fn new(file_path: &str, doc: &[u8]) -> Option { + Self::with_options(file_path, doc, DocOptions::default()) + } + + pub fn with_options(file_path: &str, doc: &[u8], options: DocOptions) -> Option { + if let Some(kind) = + infer::get(&doc[..4096.min(doc.len())]).or(infer::get_from_path(file_path).ok().flatten()) + { + if kind.extension() == "pdf" { + return Self::load_pdf(file_path, doc); + } else if kind.extension() == "docx" { + return Self::load_docx(file_path, doc); + } else if kind.extension() == "html" { + return Self::load_html(file_path, doc); + } + } else if let Ok(string) = String::from_utf8(doc.to_vec()).or_else(|_| { + String::from_utf16( + &doc + .chunks_exact(2) + .map(|b| u16::from_le_bytes([b[0], b[1]])) + .collect::>(), + ) + }) { + let path = PathBuf::from(file_path); + match path.ext_str() { + "md" => { + let loader = TextLoader::new(string); + let splitter = MarkdownSplitter::default(); + return Self::from_loader(file_path, loader, splitter).ok(); + } + "rs" | "c" | "cpp" | "h" | "hpp" | "js" | "ts" | "tsx" | "go" | "py" => { + let name = path.full_str().to_string(); + let loader = + SourceCodeLoader::from_string(string).with_parser_option(LanguageParserOptions { + language: get_language_by_filename(&name).ok()?, + parser_threshold: options.code_threshold, + }); + let splitter = TokenSplitter::default(); + return Self::from_loader(file_path, loader, splitter).ok(); + } + _ => {} + } + let loader = TextLoader::new(string); + let splitter = TokenSplitter::default(); + return Self::from_loader(file_path, loader, splitter).ok(); + } + None + } + + fn from_loader( + file_path: &str, + loader: impl Loader, + splitter: impl TextSplitter + 'static, + ) -> Result { + let name = file_path.to_string(); + let chunks = Self::get_chunks_from_loader(loader, splitter)?; + Ok(Self { name, chunks }) + } + + fn get_chunks_from_loader( + loader: impl Loader, + splitter: impl TextSplitter + 'static, + ) -> Result, LoaderError> { + let docs = loader.load_and_split(splitter)?; + Ok( + docs + .into_iter() + .enumerate() + .map(|(index, d)| Chunk { + index, + content: d.page_content, + ..Chunk::default() + }) + .collect(), + ) + } + + fn load_docx(file_path: &str, doc: &[u8]) -> Option { + let loader = DocxLoader::new(Cursor::new(doc))?; + let splitter = TokenSplitter::default(); + Self::from_loader(file_path, loader, splitter).ok() + } + + fn load_html(file_path: &str, doc: &[u8]) -> Option { + let loader = HtmlLoader::from_string( + String::from_utf8(doc.to_vec()).ok()?, + Url::parse(file_path) + .or(Url::parse("https://example.com/")) + .ok()?, + ); + let splitter = TokenSplitter::default(); + Self::from_loader(file_path, loader, splitter).ok() + } + + fn load_pdf(file_path: &str, doc: &[u8]) -> Option { + let loader = PdfExtractLoader::new(Cursor::new(doc)).ok()?; + let splitter = TokenSplitter::default(); + Self::from_loader(file_path, loader, splitter).ok() + } +} + +#[cfg(test)] +mod tests { + use std::{ + fs::{read, read_to_string}, + path::PathBuf, + }; + + use super::*; + + const FIXTURES: [&str; 6] = [ + "demo.docx", + "sample.pdf", + "sample.html", + "sample.rs", + "sample.c", + "sample.ts", + ]; + + fn get_fixtures() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures") + } + + #[test] + fn test_fixtures() { + let fixtures = get_fixtures(); + for fixture in FIXTURES.iter() { + let buffer = read(fixtures.join(fixture)).unwrap(); + let doc = Doc::with_options(fixture, &buffer, DocOptions { code_threshold: 0 }).unwrap(); + for chunk in doc.chunks.iter() { + let output = + read_to_string(fixtures.join(format!("{}.{}.md", fixture, chunk.index))).unwrap(); + assert_eq!(chunk.content, output); + } + } + } +} diff --git a/packages/common/native/src/doc_loader/loader/docx.rs b/packages/common/native/src/doc_loader/loader/docx.rs new file mode 100644 index 0000000000..1b989ff227 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/docx.rs @@ -0,0 +1,71 @@ +use docx_parser::MarkdownDocument; + +use super::*; + +#[derive(Debug)] +pub struct DocxLoader { + document: MarkdownDocument, +} + +impl DocxLoader { + pub fn new(reader: R) -> Option { + Some(Self { + document: MarkdownDocument::from_reader(reader)?, + }) + } + + fn extract_text(&self) -> String { + self.document.to_markdown(false) + } + + fn extract_text_to_doc(&self) -> Document { + Document::new(self.extract_text()) + } +} + +impl Loader for DocxLoader { + fn load(self) -> Result, LoaderError> { + let doc = self.extract_text_to_doc(); + Ok(vec![doc]) + } +} + +#[cfg(test)] +mod tests { + use std::{fs::read, io::Cursor, path::PathBuf}; + + use super::*; + + fn get_fixtures_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures") + } + + #[test] + fn test_parse_docx() { + let docx_buffer = include_bytes!("../../../fixtures/demo.docx"); + let parsed_buffer = include_str!("../../../fixtures/demo.docx.md"); + + { + let loader = DocxLoader::new(Cursor::new(docx_buffer)).unwrap(); + + let documents = loader.load().unwrap(); + + assert_eq!(documents.len(), 1); + assert_eq!(documents[0].page_content, parsed_buffer); + } + + { + let loader = DocxLoader::new(Cursor::new(docx_buffer)).unwrap(); + let documents = loader.load_and_split(TokenSplitter::default()).unwrap(); + + for (idx, doc) in documents.into_iter().enumerate() { + assert_eq!( + doc.page_content, + String::from_utf8_lossy( + &read(get_fixtures_path().join(format!("demo.docx.{}.md", idx))).unwrap() + ) + ); + } + } + } +} diff --git a/packages/common/native/src/doc_loader/loader/error.rs b/packages/common/native/src/doc_loader/loader/error.rs new file mode 100644 index 0000000000..94ded5ece9 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/error.rs @@ -0,0 +1,42 @@ +use std::{io, str::Utf8Error, string::FromUtf8Error}; + +use thiserror::Error; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +use super::*; + +#[derive(Error, Debug)] +pub enum LoaderError { + #[error("{0}")] + TextSplitterError(#[from] TextSplitterError), + + #[error(transparent)] + IOError(#[from] io::Error), + + #[error(transparent)] + Utf8Error(#[from] Utf8Error), + + #[error(transparent)] + FromUtf8Error(#[from] FromUtf8Error), + + #[cfg(feature = "pdf-extract")] + #[error(transparent)] + PdfExtractError(#[from] pdf_extract::Error), + + #[cfg(feature = "pdf-extract")] + #[error(transparent)] + PdfExtractOutputError(#[from] pdf_extract::OutputError), + + #[error(transparent)] + ReadabilityError(#[from] readability::error::Error), + + #[error("Unsupported source language")] + UnsupportedLanguage, + + #[error("Error: {0}")] + OtherError(String), +} + +pub type LoaderResult = Result; diff --git a/packages/common/native/src/doc_loader/loader/html.rs b/packages/common/native/src/doc_loader/loader/html.rs new file mode 100644 index 0000000000..347e8a9308 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/html.rs @@ -0,0 +1,87 @@ +use std::{collections::HashMap, io::Cursor}; + +use serde_json::Value; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +use super::*; +#[derive(Debug, Clone)] +pub struct HtmlLoader { + html: R, + url: Url, +} + +impl HtmlLoader>> { + pub fn from_string>(input: S, url: Url) -> Self { + let input = input.into(); + let reader = Cursor::new(input.into_bytes()); + Self::new(reader, url) + } +} + +impl HtmlLoader { + pub fn new(html: R, url: Url) -> Self { + Self { html, url } + } +} + +impl Loader for HtmlLoader { + fn load(mut self) -> Result, LoaderError> { + let cleaned_html = readability::extractor::extract(&mut self.html, &self.url)?; + let doc = + Document::new(format!("{}\n{}", cleaned_html.title, cleaned_html.text)).with_metadata( + HashMap::from([("source".to_string(), Value::from(self.url.as_str()))]), + ); + + Ok(vec![doc]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_html_loader() { + let input = "

Hello world!

"; + + let html_loader = HtmlLoader::new( + input.as_bytes(), + Url::parse("https://example.com/").unwrap(), + ); + + let documents = html_loader.load().unwrap(); + + let expected = "\nHello world!"; + + assert_eq!(documents.len(), 1); + assert_eq!( + documents[0].metadata.get("source").unwrap(), + &Value::from("https://example.com/") + ); + assert_eq!(documents[0].page_content, expected); + } + + #[test] + fn test_html_load_from_path() { + let buffer = include_bytes!("../../../fixtures/sample.html"); + let html_loader = HtmlLoader::new( + Cursor::new(buffer), + Url::parse("https://example.com/").unwrap(), + ); + + let documents = html_loader.load().unwrap(); + + let expected = "Example Domain\n\n This domain is for use in illustrative examples in \ + documents. You may\n use this domain in literature without prior \ + coordination or asking for\n permission.\n More information..."; + + assert_eq!(documents.len(), 1); + assert_eq!( + documents[0].metadata.get("source").unwrap(), + &Value::from("https://example.com/") + ); + assert_eq!(documents[0].page_content, expected); + } +} diff --git a/packages/common/native/src/doc_loader/loader/mod.rs b/packages/common/native/src/doc_loader/loader/mod.rs new file mode 100644 index 0000000000..2b26d2ddf4 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/mod.rs @@ -0,0 +1,33 @@ +mod docx; +mod error; +mod html; +mod pdf; +mod source; +mod text; + +use std::io::{Read, Seek}; + +use super::*; + +// modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders +pub trait Loader: Send + Sync { + fn load(self) -> Result, LoaderError>; + fn load_and_split( + self, + splitter: TS, + ) -> Result, LoaderError> + where + Self: Sized, + { + let docs = self.load()?; + Ok(splitter.split_documents(&docs)?) + } +} + +pub use docx::DocxLoader; +pub use error::{LoaderError, LoaderResult}; +pub use html::HtmlLoader; +pub use pdf::PdfExtractLoader; +pub use source::{get_language_by_filename, LanguageParserOptions, SourceCodeLoader}; +pub use text::TextLoader; +pub use url::Url; diff --git a/packages/common/native/src/doc_loader/loader/pdf.rs b/packages/common/native/src/doc_loader/loader/pdf.rs new file mode 100644 index 0000000000..83a240c469 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/pdf.rs @@ -0,0 +1,70 @@ +use pdf_extract::{output_doc, output_doc_encrypted, PlainTextOutput}; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +use super::*; + +#[derive(Debug, Clone)] +pub struct PdfExtractLoader { + document: pdf_extract::Document, +} + +impl PdfExtractLoader { + pub fn new(reader: R) -> Result { + let document = pdf_extract::Document::load_from(reader) + .map_err(|e| LoaderError::OtherError(e.to_string()))?; + Ok(Self { document }) + } +} + +impl PdfExtractLoader { + fn extract_text(&self) -> Result { + let mut doc = self.document.clone(); + let mut buffer: Vec = Vec::new(); + let mut output = PlainTextOutput::new(&mut buffer as &mut dyn std::io::Write); + if doc.is_encrypted() { + output_doc_encrypted(&mut doc, &mut output, "")?; + } else { + output_doc(&doc, &mut output)?; + } + Ok(String::from_utf8(buffer)?) + } + + fn extract_text_to_doc(&self) -> Result { + let text = self.extract_text()?; + Ok(Document::new(text)) + } +} + +impl Loader for PdfExtractLoader { + fn load(self) -> Result, LoaderError> { + let doc = self.extract_text_to_doc()?; + Ok(vec![doc]) + } +} + +#[cfg(test)] +mod tests { + use std::{fs::read, io::Cursor, path::PathBuf}; + + use super::*; + + #[test] + fn test_parse_pdf() { + let fixtures = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures"); + let buffer = read(fixtures.join("sample.pdf")).unwrap(); + + let reader = Cursor::new(buffer); + let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader"); + + let docs = loader.load().unwrap(); + + assert_eq!(docs.len(), 1); + assert_eq!( + &docs[0].page_content[..100], + "\n\nSample PDF\nThis is a simple PDF file. Fun fun fun.\n\nLorem ipsum dolor sit amet, \ + consectetuer a" + ); + } +} diff --git a/packages/common/native/src/doc_loader/loader/source/mod.rs b/packages/common/native/src/doc_loader/loader/source/mod.rs new file mode 100644 index 0000000000..4d93c75f96 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/source/mod.rs @@ -0,0 +1,61 @@ +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +mod parser; + +pub use parser::{get_language_by_filename, LanguageParser, LanguageParserOptions}; + +use super::*; + +#[derive(Debug, Clone)] +pub struct SourceCodeLoader { + content: String, + parser_option: LanguageParserOptions, +} + +impl SourceCodeLoader { + pub fn from_string>(input: S) -> Self { + Self { + content: input.into(), + parser_option: LanguageParserOptions::default(), + } + } +} + +impl SourceCodeLoader { + pub fn with_parser_option(mut self, parser_option: LanguageParserOptions) -> Self { + self.parser_option = parser_option; + self + } +} + +impl Loader for SourceCodeLoader { + fn load(self) -> Result, LoaderError> { + let options = self.parser_option.clone(); + + let docs = LanguageParser::from_language(options.language) + .with_parser_threshold(options.parser_threshold) + .parse_code(&self.content)?; + + Ok(docs) + } +} + +#[cfg(test)] +mod tests { + use parser::Language; + + use super::*; + + #[test] + fn test_source_code_loader() { + let content = include_str!("../../../../fixtures/sample.rs"); + let loader = SourceCodeLoader::from_string(content).with_parser_option(LanguageParserOptions { + language: Language::Rust, + ..Default::default() + }); + + let documents_with_content = loader.load().unwrap(); + assert_eq!(documents_with_content.len(), 1); + } +} diff --git a/packages/common/native/src/doc_loader/loader/source/parser.rs b/packages/common/native/src/doc_loader/loader/source/parser.rs new file mode 100644 index 0000000000..ab08f06674 --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/source/parser.rs @@ -0,0 +1,246 @@ +use std::{collections::HashMap, fmt::Debug, string::ToString}; + +use strum_macros::Display; +use tree_sitter::{Parser, Tree}; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +use super::*; + +#[derive(Display, Debug, Clone)] +pub enum Language { + Rust, + C, + Cpp, + Javascript, + Typescript, + Go, + Python, +} + +pub enum LanguageContentTypes { + SimplifiedCode, + FunctionsImpls, +} + +impl std::fmt::Display for LanguageContentTypes { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + LanguageContentTypes::SimplifiedCode => "simplified_code", + LanguageContentTypes::FunctionsImpls => "functions_impls", + } + ) + } +} + +#[derive(Debug, Clone)] +pub struct LanguageParserOptions { + pub parser_threshold: u64, + pub language: Language, +} + +impl Default for LanguageParserOptions { + fn default() -> Self { + Self { + parser_threshold: 1000, + language: Language::Rust, + } + } +} + +pub struct LanguageParser { + parser: Parser, + parser_options: LanguageParserOptions, +} + +impl Debug for LanguageParser { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "LanguageParser {{ language: {:?} }}", + self.parser_options.language + ) + } +} + +impl Clone for LanguageParser { + fn clone(&self) -> Self { + LanguageParser { + parser: get_language_parser(&self.parser_options.language), + parser_options: self.parser_options.clone(), + } + } +} + +pub fn get_language_by_filename(name: &str) -> LoaderResult { + let extension = name + .split('.') + .last() + .ok_or(LoaderError::UnsupportedLanguage)?; + let language = match extension.to_lowercase().as_str() { + "rs" => Language::Rust, + "c" => Language::C, + "cpp" => Language::Cpp, + "h" => Language::C, + "hpp" => Language::Cpp, + "js" => Language::Javascript, + "ts" => Language::Typescript, + "tsx" => Language::Typescript, + "go" => Language::Go, + "py" => Language::Python, + _ => return Err(LoaderError::UnsupportedLanguage), + }; + Ok(language) +} + +fn get_language_parser(language: &Language) -> Parser { + let mut parser = Parser::new(); + let lang = match language { + Language::Rust => tree_sitter_rust::LANGUAGE, + Language::C => tree_sitter_c::LANGUAGE, + Language::Cpp => tree_sitter_cpp::LANGUAGE, + Language::Javascript => tree_sitter_javascript::LANGUAGE, + Language::Typescript => tree_sitter_typescript::LANGUAGE_TSX, + Language::Go => tree_sitter_go::LANGUAGE, + Language::Python => tree_sitter_python::LANGUAGE, + }; + parser + .set_language(&lang.into()) + .unwrap_or_else(|_| panic!("Error loading grammar for language: {:?}", language)); + parser +} + +impl LanguageParser { + pub fn from_language(language: Language) -> Self { + Self { + parser: get_language_parser(&language), + parser_options: LanguageParserOptions { + language, + ..LanguageParserOptions::default() + }, + } + } + + pub fn with_parser_threshold(mut self, threshold: u64) -> Self { + self.parser_options.parser_threshold = threshold; + self + } +} + +impl LanguageParser { + pub fn parse_code(&mut self, code: &String) -> LoaderResult> { + let tree = self + .parser + .parse(code, None) + .ok_or(LoaderError::UnsupportedLanguage)?; + if self.parser_options.parser_threshold > tree.root_node().end_position().row as u64 { + return Ok(vec![Document::new(code).with_metadata(HashMap::from([ + ( + "content_type".to_string(), + serde_json::Value::from(LanguageContentTypes::SimplifiedCode.to_string()), + ), + ( + "language".to_string(), + serde_json::Value::from(self.parser_options.language.to_string()), + ), + ]))]); + } + self.extract_functions_classes(tree, code) + } + + pub fn extract_functions_classes( + &self, + tree: Tree, + code: &String, + ) -> LoaderResult> { + let mut chunks = Vec::new(); + + let count = tree.root_node().child_count(); + for i in 0..count { + let Some(node) = tree.root_node().child(i) else { + continue; + }; + let source_code = node.utf8_text(code.as_bytes())?.to_string(); + let lang_meta = ( + "language".to_string(), + serde_json::Value::from(self.parser_options.language.to_string()), + ); + if node.kind() == "function_item" || node.kind() == "impl_item" { + let doc = Document::new(source_code).with_metadata(HashMap::from([ + lang_meta.clone(), + ( + "content_type".to_string(), + serde_json::Value::from(LanguageContentTypes::FunctionsImpls.to_string()), + ), + ])); + chunks.push(doc); + } else { + let doc = Document::new(source_code).with_metadata(HashMap::from([ + lang_meta.clone(), + ( + "content_type".to_string(), + serde_json::Value::from(LanguageContentTypes::SimplifiedCode.to_string()), + ), + ])); + chunks.push(doc); + } + } + Ok(chunks) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_code_parser() { + let code = r#" + fn main() { + println!("Hello, world!"); + } + + pub struct Person { + name: String, + age: i32, + } + + impl Person { + pub fn new(name: String, age: i32) -> Self { + Self { name, age } + } + + pub fn get_name(&self) -> &str { + &self.name + } + + pub fn get_age(&self) -> i32 { + self.age + } + } + "#; + + let mut parser = LanguageParser::from_language(Language::Rust); + + let documents = parser.parse_code(&code.to_string()).unwrap(); + assert_eq!(documents.len(), 1); + + // Set the parser threshold to 10 for testing + let mut parser = parser.with_parser_threshold(10); + + let documents = parser.parse_code(&code.to_string()).unwrap(); + assert_eq!(documents.len(), 3); + assert_eq!( + documents[0].page_content, + "fn main() {\n println!(\"Hello, world!\");\n }" + ); + assert_eq!( + documents[1].metadata.get("content_type").unwrap(), + LanguageContentTypes::SimplifiedCode.to_string().as_str() + ); + } +} diff --git a/packages/common/native/src/doc_loader/loader/text.rs b/packages/common/native/src/doc_loader/loader/text.rs new file mode 100644 index 0000000000..0cb42e205d --- /dev/null +++ b/packages/common/native/src/doc_loader/loader/text.rs @@ -0,0 +1,24 @@ +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/document_loaders + */ +use super::*; + +#[derive(Debug, Clone)] +pub struct TextLoader { + content: String, +} + +impl TextLoader { + pub fn new>(input: T) -> Self { + Self { + content: input.into(), + } + } +} + +impl Loader for TextLoader { + fn load(self) -> Result, LoaderError> { + let doc = Document::new(self.content); + Ok(vec![doc]) + } +} diff --git a/packages/common/native/src/doc_loader/mod.rs b/packages/common/native/src/doc_loader/mod.rs new file mode 100644 index 0000000000..625a898e09 --- /dev/null +++ b/packages/common/native/src/doc_loader/mod.rs @@ -0,0 +1,12 @@ +mod document; +mod loader; +mod splitter; +mod types; + +pub use document::{Chunk, Doc}; +use loader::{ + get_language_by_filename, DocxLoader, HtmlLoader, LanguageParserOptions, Loader, LoaderError, + PdfExtractLoader, SourceCodeLoader, TextLoader, Url, +}; +use splitter::{MarkdownSplitter, TextSplitter, TextSplitterError, TokenSplitter}; +use types::Document; diff --git a/packages/common/native/src/doc_loader/splitter/error.rs b/packages/common/native/src/doc_loader/splitter/error.rs new file mode 100644 index 0000000000..969ea34c60 --- /dev/null +++ b/packages/common/native/src/doc_loader/splitter/error.rs @@ -0,0 +1,35 @@ +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/text_splitter + */ +use text_splitter::ChunkConfigError; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum TextSplitterError { + #[error("Empty input text")] + EmptyInputText, + + #[error("Mismatch metadata and text")] + MetadataTextMismatch, + + #[error("Tokenizer not found")] + TokenizerNotFound, + + #[error("Tokenizer creation failed due to invalid tokenizer")] + InvalidTokenizer, + + #[error("Tokenizer creation failed due to invalid model")] + InvalidModel, + + #[error("Invalid chunk overlap and size")] + InvalidSplitterOptions, + + #[error("Error: {0}")] + OtherError(String), +} + +impl From for TextSplitterError { + fn from(_: ChunkConfigError) -> Self { + Self::InvalidSplitterOptions + } +} diff --git a/packages/common/native/src/doc_loader/splitter/markdown.rs b/packages/common/native/src/doc_loader/splitter/markdown.rs new file mode 100644 index 0000000000..7fbff1d1c2 --- /dev/null +++ b/packages/common/native/src/doc_loader/splitter/markdown.rs @@ -0,0 +1,36 @@ +use text_splitter::ChunkConfig; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/text_splitter + */ +use super::*; + +pub struct MarkdownSplitter { + splitter_options: SplitterOptions, +} + +impl Default for MarkdownSplitter { + fn default() -> Self { + MarkdownSplitter::new(SplitterOptions::default()) + } +} + +impl MarkdownSplitter { + pub fn new(options: SplitterOptions) -> MarkdownSplitter { + MarkdownSplitter { + splitter_options: options, + } + } +} + +impl TextSplitter for MarkdownSplitter { + fn split_text(&self, text: &str) -> Result, TextSplitterError> { + let chunk_config = ChunkConfig::try_from(&self.splitter_options)?; + Ok( + text_splitter::MarkdownSplitter::new(chunk_config) + .chunks(text) + .map(|x| x.to_string()) + .collect(), + ) + } +} diff --git a/packages/common/native/src/doc_loader/splitter/mod.rs b/packages/common/native/src/doc_loader/splitter/mod.rs new file mode 100644 index 0000000000..65194c8aa8 --- /dev/null +++ b/packages/common/native/src/doc_loader/splitter/mod.rs @@ -0,0 +1,58 @@ +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/text_splitter + */ +mod error; +mod markdown; +mod options; +mod token; + +use std::collections::HashMap; + +pub use error::TextSplitterError; +pub use markdown::MarkdownSplitter; +use options::SplitterOptions; +use serde_json::Value; +pub use token::TokenSplitter; + +use super::*; + +pub trait TextSplitter: Send + Sync { + fn split_text(&self, text: &str) -> Result, TextSplitterError>; + + fn split_documents(&self, documents: &[Document]) -> Result, TextSplitterError> { + let mut texts: Vec = Vec::new(); + let mut metadatas: Vec> = Vec::new(); + documents.iter().for_each(|d| { + texts.push(d.page_content.clone()); + metadatas.push(d.metadata.clone()); + }); + + self.create_documents(&texts, &metadatas) + } + + fn create_documents( + &self, + text: &[String], + metadatas: &[HashMap], + ) -> Result, TextSplitterError> { + let mut metadatas = metadatas.to_vec(); + if metadatas.is_empty() { + metadatas = vec![HashMap::new(); text.len()]; + } + + if text.len() != metadatas.len() { + return Err(TextSplitterError::MetadataTextMismatch); + } + + let mut documents: Vec = Vec::new(); + for i in 0..text.len() { + let chunks = self.split_text(&text[i])?; + for chunk in chunks { + let document = Document::new(chunk).with_metadata(metadatas[i].clone()); + documents.push(document); + } + } + + Ok(documents) + } +} diff --git a/packages/common/native/src/doc_loader/splitter/options.rs b/packages/common/native/src/doc_loader/splitter/options.rs new file mode 100644 index 0000000000..e5b6dfa840 --- /dev/null +++ b/packages/common/native/src/doc_loader/splitter/options.rs @@ -0,0 +1,96 @@ +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/text_splitter + */ +use text_splitter::ChunkConfig; +use tiktoken_rs::{get_bpe_from_model, get_bpe_from_tokenizer, tokenizer::Tokenizer, CoreBPE}; + +use super::TextSplitterError; + +// Options is a struct that contains options for a text splitter. +#[derive(Debug, Clone)] +pub struct SplitterOptions { + pub chunk_size: usize, + pub chunk_overlap: usize, + pub model_name: String, + pub encoding_name: String, + pub trim_chunks: bool, +} + +impl Default for SplitterOptions { + fn default() -> Self { + Self::new() + } +} + +impl SplitterOptions { + pub fn new() -> Self { + SplitterOptions { + chunk_size: 512, + chunk_overlap: 0, + model_name: String::from("gpt-3.5-turbo"), + encoding_name: String::from("cl100k_base"), + trim_chunks: false, + } + } +} + +// Builder pattern for Options struct +impl SplitterOptions { + pub fn with_chunk_size(mut self, chunk_size: usize) -> Self { + self.chunk_size = chunk_size; + self + } + + pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self { + self.chunk_overlap = chunk_overlap; + self + } + + pub fn with_model_name(mut self, model_name: &str) -> Self { + self.model_name = String::from(model_name); + self + } + + pub fn with_encoding_name(mut self, encoding_name: &str) -> Self { + self.encoding_name = String::from(encoding_name); + self + } + + pub fn with_trim_chunks(mut self, trim_chunks: bool) -> Self { + self.trim_chunks = trim_chunks; + self + } + + pub fn get_tokenizer_from_str(s: &str) -> Option { + match s.to_lowercase().as_str() { + "cl100k_base" => Some(Tokenizer::Cl100kBase), + "p50k_base" => Some(Tokenizer::P50kBase), + "r50k_base" => Some(Tokenizer::R50kBase), + "p50k_edit" => Some(Tokenizer::P50kEdit), + "gpt2" => Some(Tokenizer::Gpt2), + _ => None, + } + } +} + +impl TryFrom<&SplitterOptions> for ChunkConfig { + type Error = TextSplitterError; + + fn try_from(options: &SplitterOptions) -> Result { + let tk = if !options.encoding_name.is_empty() { + let tokenizer = SplitterOptions::get_tokenizer_from_str(&options.encoding_name) + .ok_or(TextSplitterError::TokenizerNotFound)?; + + get_bpe_from_tokenizer(tokenizer).map_err(|_| TextSplitterError::InvalidTokenizer)? + } else { + get_bpe_from_model(&options.model_name).map_err(|_| TextSplitterError::InvalidModel)? + }; + + Ok( + ChunkConfig::new(options.chunk_size) + .with_sizer(tk) + .with_trim(options.trim_chunks) + .with_overlap(options.chunk_overlap)?, + ) + } +} diff --git a/packages/common/native/src/doc_loader/splitter/token.rs b/packages/common/native/src/doc_loader/splitter/token.rs new file mode 100644 index 0000000000..26d90150db --- /dev/null +++ b/packages/common/native/src/doc_loader/splitter/token.rs @@ -0,0 +1,37 @@ +use text_splitter::ChunkConfig; + +/** + * modified from https://github.com/Abraxas-365/langchain-rust/tree/v4.6.0/src/text_splitter + */ +use super::*; + +#[derive(Debug, Clone)] +pub struct TokenSplitter { + splitter_options: SplitterOptions, +} + +impl Default for TokenSplitter { + fn default() -> Self { + TokenSplitter::new(SplitterOptions::default()) + } +} + +impl TokenSplitter { + pub fn new(options: SplitterOptions) -> TokenSplitter { + TokenSplitter { + splitter_options: options, + } + } +} + +impl TextSplitter for TokenSplitter { + fn split_text(&self, text: &str) -> Result, TextSplitterError> { + let chunk_config = ChunkConfig::try_from(&self.splitter_options)?; + Ok( + text_splitter::TextSplitter::new(chunk_config) + .chunks(text) + .map(|x| x.to_string()) + .collect(), + ) + } +} diff --git a/packages/common/native/src/doc_loader/types.rs b/packages/common/native/src/doc_loader/types.rs new file mode 100644 index 0000000000..c2955ece88 --- /dev/null +++ b/packages/common/native/src/doc_loader/types.rs @@ -0,0 +1,37 @@ +use std::collections::HashMap; + +use serde_json::Value; + +#[derive(Debug, Clone)] +pub struct Document { + pub page_content: String, + pub metadata: HashMap, +} + +impl Document { + /// Constructs a new `Document` with provided `page_content`, an empty + /// `metadata` map and a `score` of 0. + pub fn new>(page_content: S) -> Self { + Document { + page_content: page_content.into(), + metadata: HashMap::new(), + } + } + + /// Sets the `metadata` Map of the `Document` to the provided HashMap. + pub fn with_metadata(mut self, metadata: HashMap) -> Self { + self.metadata = metadata; + self + } +} + +impl Default for Document { + /// Provides a default `Document` with an empty `page_content`, an empty + /// `metadata` map and a `score` of 0. + fn default() -> Self { + Document { + page_content: "".to_string(), + metadata: HashMap::new(), + } + } +} diff --git a/packages/common/native/src/lib.rs b/packages/common/native/src/lib.rs index 908ae1a77b..d865fd7a21 100644 --- a/packages/common/native/src/lib.rs +++ b/packages/common/native/src/lib.rs @@ -1 +1,3 @@ +#[cfg(feature = "doc-loader")] +pub mod doc_loader; pub mod hashcash;