diff --git a/Cargo.lock b/Cargo.lock index 0341b4e..7e064ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,9 +82,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.44" +version = "1.2.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37521ac7aabe3d13122dc382493e20c9416f299d2ccd5b3a5340a2570cdeb0f3" +checksum = "35900b6c8d709fb1d854671ae27aeaa9eec2f8b01b364e1619a40da3e6fe2afe" dependencies = [ "find-msvc-tools", "shlex", @@ -423,9 +423,9 @@ checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "hyper" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +checksum = "1744436df46f0bde35af3eda22aeaba453aada65d8f1c171cd8a5f59030bd69f" dependencies = [ "atomic-waker", "bytes", @@ -602,9 +602,9 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" dependencies = [ "memchr", "serde", @@ -637,7 +637,7 @@ dependencies = [ [[package]] name = "kiru" -version = "0.1.10" +version = "0.1.11" dependencies = [ "criterion", "crossbeam-channel", @@ -653,7 +653,7 @@ dependencies = [ [[package]] name = "kiru-py" -version = "0.1.10" +version = "0.1.11" dependencies = [ "kiru", "pyo3", @@ -958,9 +958,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.41" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -1133,9 +1133,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.34" +version = "0.23.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" dependencies = [ "once_cell", "ring", @@ -1296,9 +1296,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.108" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", @@ -1658,9 +1658,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b130c0d2d49f8b6889abc456e795e82525204f27c42cf767cf0d7734e089b8" +checksum = "b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e" dependencies = [ "rustls-pki-types", ] diff --git a/kiru-core/Cargo.toml b/kiru-core/Cargo.toml index 45671f0..a58747f 100644 --- a/kiru-core/Cargo.toml +++ b/kiru-core/Cargo.toml @@ -1,7 +1,7 @@ # kiru/Cargo.toml [package] name = "kiru" -version = "0.1.10" +version = "0.1.11" edition = "2021" description = "Fast text chunking for Rust" license = "MIT" diff --git a/kiru-core/benches/par_chunking.rs b/kiru-core/benches/par_chunking.rs index 62b6e3d..d53f35b 100644 --- a/kiru-core/benches/par_chunking.rs +++ b/kiru-core/benches/par_chunking.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use kiru::{ChunkerBuilder, ChunkerEnum, Source}; +use kiru::{ChunkerBuilder, Source}; use std::fs; use std::hint::black_box; use std::time::Duration; @@ -33,10 +33,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) { group.bench_function("serial_single", |b| { let source = Source::File(LARGE_FILE_PATH.to_string()); b.iter(|| { - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker.on_source(source.clone()).unwrap(); let chunks: Vec<_> = iter.collect(); black_box(chunks); @@ -54,10 +51,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker.on_sources(sources.clone()).unwrap(); let chunks: Vec<_> = iter.collect(); black_box(chunks); @@ -71,10 +65,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let chunks: Vec<_> = chunker.on_sources_par(sources.clone()).unwrap(); black_box(chunks); }); @@ -87,10 +78,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker .on_sources_par_stream(sources.clone(), CHANNEL_SIZE) .unwrap(); @@ -122,10 +110,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) { group.bench_function("serial_single", |b| { let source = Source::File(LARGE_FILE_PATH.to_string()); b.iter(|| { - let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker.on_source(source.clone()).unwrap(); let chunks: Vec<_> = iter.collect(); black_box(chunks); @@ -143,10 +128,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker.on_sources(sources.clone()).unwrap(); let chunks: Vec<_> = iter.collect(); black_box(chunks); @@ -160,10 +142,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let chunks: Vec<_> = chunker.on_sources_par(sources.clone()).unwrap(); black_box(chunks); }); @@ -176,10 +155,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker .on_sources_par_stream(sources.clone(), CHANNEL_SIZE) .unwrap(); @@ -219,10 +195,7 @@ fn benchmark_channel_size(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker .on_sources_par_stream(sources.clone(), channel_size) .unwrap(); @@ -238,10 +211,7 @@ fn benchmark_channel_size(c: &mut Criterion) { &sources, |b, sources| { b.iter(|| { - let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size: CHUNK_SIZE, - overlap: OVERLAP, - }); + let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap(); let iter = chunker .on_sources_par_stream(sources.clone(), channel_size) .unwrap(); diff --git a/kiru-core/src/bin/benchmark.rs b/kiru-core/src/bin/benchmark.rs index 469d079..08f57c9 100644 --- a/kiru-core/src/bin/benchmark.rs +++ b/kiru-core/src/bin/benchmark.rs @@ -1,6 +1,6 @@ // kiru-core/src/bin/benchmark.rs -use kiru::{ChunkerBuilder, ChunkerEnum, Source}; +use kiru::{ChunkerBuilder, Source}; use serde::Serialize; use std::env; use std::time::Instant; @@ -77,21 +77,6 @@ fn run_benchmark( chunk_size: usize, overlap: usize, ) -> Result> { - // Create the chunker using ChunkerBuilder - let chunker = match strategy { - "bytes" => ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size, - overlap, - }), - "chars" => ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size, - overlap, - }), - _ => { - return Err(format!("Invalid strategy '{}'. Use 'bytes' or 'chars'", strategy).into()); - } - }; - // Parse the source based on source_type let source = match source_type { "file" => Source::File(path.to_string()), @@ -106,21 +91,42 @@ fn run_benchmark( } }; - // Run the benchmark + // Create the chunker using ChunkerBuilder + match strategy { + "bytes" => { + let chunker = ChunkerBuilder::by_bytes(chunk_size, overlap)?; + bench_with(chunker, source) + } + "chars" => { + let chunker = ChunkerBuilder::by_characters(chunk_size, overlap)?; + bench_with(chunker, source) + } + _ => { + Err(format!("Invalid strategy '{}'. Use 'bytes' or 'chars'", strategy).into()) + } + } +} + +// Generic benchmarking body specialized for the concrete chunker type. +fn bench_with( + chunker: kiru::ChunkerWithStrategy, + source: Source, +) -> Result> +where + C: kiru::Chunker, +{ let start = Instant::now(); - let mut num_chunks = 0; - let mut total_bytes = 0; + let mut num_chunks = 0usize; + let mut total_bytes = 0usize; let iterator = chunker.on_source(source)?; - for chunk in iterator { num_chunks += 1; total_bytes += chunk.len(); std::hint::black_box(chunk.len()); } - let elapsed = start.elapsed(); - let elapsed_secs = elapsed.as_secs_f64(); + let elapsed_secs = start.elapsed().as_secs_f64(); let throughput_mb_s = (total_bytes as f64) / (1024.0 * 1024.0) / elapsed_secs; Ok(BenchmarkResult { diff --git a/kiru-core/src/bytes_chunker.rs b/kiru-core/src/bytes_chunker.rs index 2adc424..8f872e5 100644 --- a/kiru-core/src/bytes_chunker.rs +++ b/kiru-core/src/bytes_chunker.rs @@ -6,6 +6,7 @@ struct BytesChunkIndices { new_position: usize, } +#[derive(Clone)] pub struct BytesChunker { chunk_size: usize, overlap: usize, diff --git a/kiru-core/src/characters_chunker.rs b/kiru-core/src/characters_chunker.rs index b0f6450..b342f7a 100644 --- a/kiru-core/src/characters_chunker.rs +++ b/kiru-core/src/characters_chunker.rs @@ -15,6 +15,7 @@ struct CharactersChunkIndices { new_char_position: usize, } +#[derive(Clone)] pub struct CharactersChunker { chunk_size: usize, overlap: usize, diff --git a/kiru-core/src/chunker.rs b/kiru-core/src/chunker.rs index 06d0a70..c96ab81 100644 --- a/kiru-core/src/chunker.rs +++ b/kiru-core/src/chunker.rs @@ -159,7 +159,7 @@ where } } -pub trait Chunker { +pub trait Chunker: Clone + Sync + Send + 'static { fn chunk_string(self, input: String) -> impl Iterator; fn chunk_stream(self, input: impl Iterator) -> impl Iterator; } @@ -173,58 +173,48 @@ pub enum ChunkerEnum { pub struct ChunkerBuilder {} impl ChunkerBuilder { - pub fn by_bytes(chunker_params: ChunkerEnum) -> ChunkerWithStrategy { - ChunkerWithStrategy { chunker_params } + pub fn by_bytes( + chunk_size: usize, + overlap: usize, + ) -> Result, ChunkingError> { + Ok(ChunkerWithStrategy { + chunker: BytesChunker::new(chunk_size, overlap)?, + }) } - pub fn by_characters(chunker_params: ChunkerEnum) -> ChunkerWithStrategy { - ChunkerWithStrategy { chunker_params } + pub fn by_characters( + chunk_size: usize, + overlap: usize, + ) -> Result, ChunkingError> { + Ok(ChunkerWithStrategy { + chunker: CharactersChunker::new(chunk_size, overlap)?, + }) } } // Update ChunkerWithStrategy to use ChunkerEnum -pub struct ChunkerWithStrategy { - chunker_params: ChunkerEnum, +pub struct ChunkerWithStrategy { + chunker: C, } -impl ChunkerWithStrategy { - pub fn on_source( - &self, - source: Source, - ) -> Result + Send + Sync>, ChunkingError> { +impl ChunkerWithStrategy { + pub fn on_source(&self, source: Source) -> Result, ChunkingError> { let stream = StreamType::from_source(&source)?; - match self.chunker_params { - ChunkerEnum::Bytes { - chunk_size, - overlap, - } => { - let chunker = BytesChunker::new(chunk_size, overlap)?; - Ok(Box::new(chunker.chunk_stream(stream))) - } - ChunkerEnum::Characters { - chunk_size, - overlap, - } => { - let chunker = CharactersChunker::new(chunk_size, overlap)?; - Ok(Box::new(chunker.chunk_stream(stream))) - } - } + Ok(self.chunker.clone().chunk_stream(stream)) } pub fn on_sources( &self, sources: Vec, - ) -> Result + Send + Sync>, ChunkingError> { - let iterators: Vec + Send + Sync>> = sources + ) -> Result, ChunkingError> { + let iterators = sources .into_iter() .map(|s| self.on_source(s)) .collect::, _>>()?; // Chain all iterators together - let chained = iterators.into_iter().flatten(); - - Ok(Box::new(chained)) + Ok(iterators.into_iter().flatten()) } pub fn on_sources_par(&self, sources: Vec) -> Result, ChunkingError> { @@ -243,26 +233,23 @@ impl ChunkerWithStrategy { &self, sources: Vec, channel_size: usize, - ) -> Result + Send + Sync>, ChunkingError> { + ) -> Result, ChunkingError> { // Pre-validate: check all sources are accessible for source in &sources { StreamType::from_source(source)?; // This validates the source } let (sender, receiver) = bounded(channel_size); - let chunker_params = self.chunker_params.clone(); + let chunker = self.chunker.clone(); thread::spawn({ move || { sources.into_par_iter().for_each(|source| { let sender = sender.clone(); - let chunker = ChunkerWithStrategy { - chunker_params: chunker_params.clone(), - }; // Should not fail since we pre-validated - if let Ok(iter) = chunker.on_source(source) { - for chunk in iter { + if let Ok(stream) = StreamType::from_source(&source) { + for chunk in chunker.clone().chunk_stream(stream) { if sender.send(chunk).is_err() { break; } @@ -294,12 +281,10 @@ mod tests { println!("{:?}", sources); - let u = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: 1024, - overlap: 128, - }) - .on_sources_par_stream(sources, 1000) - .unwrap(); + let u = ChunkerBuilder::by_bytes(1024, 128) + .unwrap() + .on_sources_par_stream(sources, 1000) + .unwrap(); // Add assertions here for chunk in u { @@ -318,10 +303,7 @@ mod tests { let sources = HigherOrderSource::into_flattened_sources(sources).unwrap(); let start = Instant::now(); - let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size: 1024, - overlap: 128, - }); + let chunker = ChunkerBuilder::by_bytes(1024, 128).unwrap(); let chunks = chunker .on_sources_par_stream(sources, 10000) .unwrap() diff --git a/kiru-py/Cargo.toml b/kiru-py/Cargo.toml index 5c71546..f40fd1f 100644 --- a/kiru-py/Cargo.toml +++ b/kiru-py/Cargo.toml @@ -1,7 +1,7 @@ # kiru-py/Cargo.toml [package] name = "kiru-py" -version = "0.1.10" +version = "0.1.11" edition = "2021" description = "Python bindings for kiru text chunking library" repository = "https://github.com/bitswired/kiru" @@ -17,6 +17,6 @@ pyo3 = { workspace = true } [dependencies.kiru] path = "../kiru-core" -version = "0.1.10" +version = "0.1.11" [build-dependencies] # None needed for simple bindings diff --git a/kiru-py/benchmark_results.csv b/kiru-py/benchmark_results.csv index e282807..923dc55 100644 --- a/kiru-py/benchmark_results.csv +++ b/kiru-py/benchmark_results.csv @@ -1,121 +1,121 @@ library,strategy,source,file_size_mb,chunk_size,overlap,run,time_s,memory_mb,throughput_mb_s -kiru-native,chars,string,1.0,1048576,1024,0,0.002980792,,335.4813083234255 -kiru-native,chars,string,1.0,1048576,1024,0,0.004129,,242.18939210462582 -kiru-native,chars,string,1.0,1048576,1024,0,0.003819875,,261.78867109525834 -kiru-native,chars,string,1.0,1048576,4096,0,0.00345175,,289.70811907003696 -kiru-native,chars,string,1.0,1048576,4096,0,0.003440333,,290.66953693145405 -kiru-native,chars,string,1.0,1048576,4096,0,0.003358,,297.79630732578914 -kiru-native,chars,string,1.0,1048576,8192,0,0.003233417,,309.2703477466717 -kiru-native,chars,string,1.0,1048576,8192,0,0.003172542,,315.2046529250046 -kiru-native,chars,string,1.0,1048576,8192,0,0.003276459,,305.20754265504314 -kiru-native,chars,string,1.0,1048576,16384,0,0.003188959,,313.58195574166996 -kiru-native,chars,string,1.0,1048576,16384,0,0.003206125,,311.9029981675699 -kiru-native,chars,string,1.0,1048576,16384,0,0.003052167,,327.636069717024 -kiru-native,chars,file,1.0,1048576,1024,0,0.003338459,,299.5393982672844 -kiru-native,chars,file,1.0,1048576,1024,0,0.003477458,,287.5663775090885 -kiru-native,chars,file,1.0,1048576,1024,0,0.003577375,,279.53457493273703 -kiru-native,chars,file,1.0,1048576,4096,0,0.003531458,,283.16916129258794 -kiru-native,chars,file,1.0,1048576,4096,0,0.003395541,,294.503880235874 -kiru-native,chars,file,1.0,1048576,4096,0,0.003443209,,290.4267501624212 -kiru-native,chars,file,1.0,1048576,8192,0,0.003697875,,270.4255822600818 -kiru-native,chars,file,1.0,1048576,8192,0,0.003548666,,281.79603265001555 -kiru-native,chars,file,1.0,1048576,8192,0,0.003420083,,292.3905647903867 -kiru-native,chars,file,1.0,1048576,16384,0,0.003384042,,295.50460662131263 -kiru-native,chars,file,1.0,1048576,16384,0,0.003471167,,288.0875509590867 -kiru-native,chars,file,1.0,1048576,16384,0,0.003655,,273.5978112175103 -kiru-native,bytes,string,1.0,1048576,1024,0,0.000222167,,4501.118527954197 -kiru-native,bytes,string,1.0,1048576,1024,0,0.000210292,,4755.292640709109 -kiru-native,bytes,string,1.0,1048576,1024,0,0.000245792,,4068.4806665798724 -kiru-native,bytes,string,1.0,1048576,4096,0,0.000228791,,4370.801299002146 -kiru-native,bytes,string,1.0,1048576,4096,0,0.000218667,,4573.163760421097 -kiru-native,bytes,string,1.0,1048576,4096,0,0.000333417,,2999.247188955572 -kiru-native,bytes,string,1.0,1048576,8192,0,0.000250208,,3996.674766594194 -kiru-native,bytes,string,1.0,1048576,8192,0,0.000338458,,2954.576343298135 -kiru-native,bytes,string,1.0,1048576,8192,0,0.000247458,,4041.08980109756 -kiru-native,bytes,string,1.0,1048576,16384,0,0.000224334,,4457.639056050353 -kiru-native,bytes,string,1.0,1048576,16384,0,0.000345333,,2895.7556908838715 -kiru-native,bytes,string,1.0,1048576,16384,0,0.000234292,,4268.178170829563 -kiru-native,bytes,file,1.0,1048576,1024,0,0.001057583,,945.5522639830633 -kiru-native,bytes,file,1.0,1048576,1024,0,0.000977083,,1023.4545069354396 -kiru-native,bytes,file,1.0,1048576,1024,0,0.001037667,,963.7003007708639 -kiru-native,bytes,file,1.0,1048576,4096,0,0.000933833,,1070.855281404705 -kiru-native,bytes,file,1.0,1048576,4096,0,0.001102792,,906.7893129438734 -kiru-native,bytes,file,1.0,1048576,4096,0,0.000987417,,1012.7433495676092 -kiru-native,bytes,file,1.0,1048576,8192,0,0.001025875,,974.7776288534177 -kiru-native,bytes,file,1.0,1048576,8192,0,0.000954083,,1048.1268401176837 -kiru-native,bytes,file,1.0,1048576,8192,0,0.000950416,,1052.1708388747663 -kiru-native,bytes,file,1.0,1048576,16384,0,0.000927708,,1077.9253816933776 -kiru-native,bytes,file,1.0,1048576,16384,0,0.000933,,1071.8113612004288 -kiru-native,bytes,file,1.0,1048576,16384,0,0.000935291,,1069.1859538902866 -kiru,chars,string,1.0,1048576,1024,0,0.004347917012637481,3.9179420471191406,229.99519013206557 -kiru,chars,string,1.0,1048576,1024,0,0.003130500001134351,2.9384889602661133,319.437789374747 -kiru,chars,string,1.0,1048576,1024,0,0.0030077499977778643,2.9384889602661133,332.4744412729792 -kiru,chars,string,1.0,1048576,4096,0,0.0033740829967428,2.9384889602661133,296.3768232629009 -kiru,chars,string,1.0,1048576,4096,0,0.0031775419774930924,2.9384889602661133,314.7086669769019 -kiru,chars,string,1.0,1048576,4096,0,0.003113624989055097,2.9384889602661133,321.1690564904779 -kiru,chars,string,1.0,1048576,8192,0,0.0032195829844567925,2.9384889602661133,310.59923127551247 -kiru,chars,string,1.0,1048576,8192,0,0.0032872920273803174,2.9384889602661133,304.2017538055212 -kiru,chars,string,1.0,1048576,8192,0,0.0031174579926300794,2.9384889602661133,320.77416996927633 -kiru,chars,string,1.0,1048576,16384,0,0.003126666008029133,2.9384889602661133,319.82949167965063 -kiru,chars,string,1.0,1048576,16384,0,0.0032596670207567513,2.9384889602661133,306.77980101410606 -kiru,chars,string,1.0,1048576,16384,0,0.0029991670162416995,2.9384889602661133,333.42591278998356 -kiru,chars,file,1.0,1048576,1024,0,0.0033220839977730066,3.0001325607299805,301.0158685543052 -kiru,chars,file,1.0,1048576,1024,0,0.003645290998974815,3.0001325607299805,274.3265216086275 -kiru,chars,file,1.0,1048576,1024,0,0.003079042013268918,3.0001325607299805,324.7763413719493 -kiru,chars,file,1.0,1048576,4096,0,0.003104874980635941,3.0001325607299805,322.07415958344956 -kiru,chars,file,1.0,1048576,4096,0,0.003110332996584475,3.0001325607299805,321.508983474799 -kiru,chars,file,1.0,1048576,4096,0,0.003057583002373576,3.0001325607299805,327.05571663098215 -kiru,chars,file,1.0,1048576,8192,0,0.003048040991416201,3.0001325607299805,328.0795772813322 -kiru,chars,file,1.0,1048576,8192,0,0.0033030420017894357,3.0001325607299805,302.7512212857863 -kiru,chars,file,1.0,1048576,8192,0,0.003511957998853177,3.0001325607299805,284.74144631756644 -kiru,chars,file,1.0,1048576,16384,0,0.003222999977879226,3.0001325607299805,310.2699369728238 -kiru,chars,file,1.0,1048576,16384,0,0.0032509999873582274,3.0001325607299805,307.5976634538849 -kiru,chars,file,1.0,1048576,16384,0,0.0032008750131353736,3.0001325607299805,312.41457285783355 -kiru,bytes,string,1.0,1048576,1024,0,0.0008377920021302998,2.9384889602661133,1193.613686281613 -kiru,bytes,string,1.0,1048576,1024,0,0.0007105419936124235,2.9384889602661133,1407.3763535297337 -kiru,bytes,string,1.0,1048576,1024,0,0.0007148750009946525,2.9384889602661133,1398.8459501432199 -kiru,bytes,string,1.0,1048576,4096,0,0.0007127909921109676,2.9384889602661133,1402.9357989478065 -kiru,bytes,string,1.0,1048576,4096,0,0.0006712500180583447,2.9384889602661133,1489.7578742606163 -kiru,bytes,string,1.0,1048576,4096,0,0.0006607919931411743,2.9384889602661133,1513.335528244447 -kiru,bytes,string,1.0,1048576,8192,0,0.0006650000286754221,2.9384889602661133,1503.7593336527314 -kiru,bytes,string,1.0,1048576,8192,0,0.0006990830006543547,2.9384889602661133,1430.4453105911334 -kiru,bytes,string,1.0,1048576,8192,0,0.0011853339965455234,2.9384889602661133,843.644072400141 -kiru,bytes,string,1.0,1048576,16384,0,0.0007098749920260161,2.9384889602661133,1408.6987303862525 -kiru,bytes,string,1.0,1048576,16384,0,0.0006961670005694032,2.9384889602661133,1436.4369457071195 -kiru,bytes,string,1.0,1048576,16384,0,0.0007075410103425384,2.9384889602661133,1413.3456370477732 -kiru,bytes,file,1.0,1048576,1024,0,0.001501749997260049,3.0001325607299805,665.8897964538075 -kiru,bytes,file,1.0,1048576,1024,0,0.0013604999985545874,3.0001325607299805,735.0238890572678 -kiru,bytes,file,1.0,1048576,1024,0,0.0012536669964902103,3.0001325607299805,797.659986902119 -kiru,bytes,file,1.0,1048576,4096,0,0.0012760419922415167,3.0001325607299805,783.6732694379307 -kiru,bytes,file,1.0,1048576,4096,0,0.0015595830045640469,3.0001325607299805,641.1970360497304 -kiru,bytes,file,1.0,1048576,4096,0,0.001421791996108368,3.0001325607299805,703.337761597429 -kiru,bytes,file,1.0,1048576,8192,0,0.0012883750023320317,3.0001325607299805,776.1715325040794 -kiru,bytes,file,1.0,1048576,8192,0,0.0012465000036172569,3.0001325607299805,802.2462872828472 -kiru,bytes,file,1.0,1048576,8192,0,0.001261833996977657,3.0001325607299805,792.4972717450937 -kiru,bytes,file,1.0,1048576,16384,0,0.001524958002846688,3.0001325607299805,655.7557638526883 -kiru,bytes,file,1.0,1048576,16384,0,0.0013369590160436928,3.0001325607299805,747.9660842253666 -kiru,bytes,file,1.0,1048576,16384,0,0.001348958001472056,3.0001325607299805,741.3129236853526 -langchain,chars,string,1.0,1048576,1024,0,2.982076207990758,18.6186466217041,0.3353368358999024 -langchain,chars,string,1.0,1048576,1024,0,3.0477626659849193,18.61863899230957,0.3281095379114235 -langchain,chars,string,1.0,1048576,1024,0,2.9746567910187878,18.61863136291504,0.3361732361929091 -langchain,chars,string,1.0,1048576,4096,0,2.956034875009209,18.618738174438477,0.3382910020629864 -langchain,chars,string,1.0,1048576,4096,0,2.9766609170183074,18.618616104125977,0.33594689750611245 -langchain,chars,string,1.0,1048576,4096,0,2.971928166982252,18.618608474731445,0.3364818877891714 -langchain,chars,string,1.0,1048576,8192,0,2.9642003329936415,18.618730545043945,0.33735911465540785 -langchain,chars,string,1.0,1048576,8192,0,2.9704757090075873,18.61858558654785,0.3366464155783628 -langchain,chars,string,1.0,1048576,8192,0,2.975909250002587,18.61857795715332,0.33603175231204735 -langchain,chars,string,1.0,1048576,16384,0,2.9775873749749735,18.618684768676758,0.33584236969986647 -langchain,chars,string,1.0,1048576,16384,0,2.987198332994012,18.618562698364258,0.33476183651914365 -langchain,chars,string,1.0,1048576,16384,0,2.9781107919989154,18.618555068969727,0.3357833438187159 -langchain,chars,file,1.0,1048576,1024,0,3.0039237909950316,20.532188415527344,0.33289792603851515 -langchain,chars,file,1.0,1048576,1024,0,2.9778876659984235,20.532440185546875,0.3358085032615631 -langchain,chars,file,1.0,1048576,1024,0,2.982134624995524,20.53216552734375,0.3353302669900427 -langchain,chars,file,1.0,1048576,4096,0,2.969576000003144,20.53215789794922,0.3367484112206393 -langchain,chars,file,1.0,1048576,4096,0,2.974966540990863,20.532325744628906,0.3361382342360506 -langchain,chars,file,1.0,1048576,4096,0,2.986541207996197,20.532142639160156,0.33483549375531446 -langchain,chars,file,1.0,1048576,8192,0,2.9875509589910507,20.532135009765625,0.33472232397927626 -langchain,chars,file,1.0,1048576,8192,0,2.970236332999775,20.532386779785156,0.33667354644135505 -langchain,chars,file,1.0,1048576,8192,0,2.984440166997956,20.532119750976562,0.33507121739548845 -langchain,chars,file,1.0,1048576,16384,0,2.986593083012849,20.53211212158203,0.3348296778988079 -langchain,chars,file,1.0,1048576,16384,0,2.9700638329959475,20.53211212158203,0.3366931002931628 -langchain,chars,file,1.0,1048576,16384,0,2.9410979580134153,20.53228759765625,0.34000907629593435 +kiru-native,chars,string,1.0,1048576,1024,0,0.003684417,,271.41336064837395 +kiru-native,chars,string,1.0,1048576,1024,0,0.00597975,,167.23107153309084 +kiru-native,chars,string,1.0,1048576,1024,0,0.004696834,,212.90937682702858 +kiru-native,chars,string,1.0,1048576,4096,0,0.00445975,,224.22781546050786 +kiru-native,chars,string,1.0,1048576,4096,0,0.003947834,,253.30345703492094 +kiru-native,chars,string,1.0,1048576,4096,0,0.004050375,,246.89071999506217 +kiru-native,chars,string,1.0,1048576,8192,0,0.004209167,,237.57669866745607 +kiru-native,chars,string,1.0,1048576,8192,0,0.0038425,,260.24723487312946 +kiru-native,chars,string,1.0,1048576,8192,0,0.004133333,,241.93550338189542 +kiru-native,chars,string,1.0,1048576,16384,0,0.003924333,,254.82037329655765 +kiru-native,chars,string,1.0,1048576,16384,0,0.00383375,,260.84121291164007 +kiru-native,chars,string,1.0,1048576,16384,0,0.003947875,,253.30082639394612 +kiru-native,chars,file,1.0,1048576,1024,0,0.004324834,,231.22274750892174 +kiru-native,chars,file,1.0,1048576,1024,0,0.004240709,,235.80962523012073 +kiru-native,chars,file,1.0,1048576,1024,0,0.004157208,,240.54605879715425 +kiru-native,chars,file,1.0,1048576,4096,0,0.004203333,,237.90644233992404 +kiru-native,chars,file,1.0,1048576,4096,0,0.00417425,,239.56399353177218 +kiru-native,chars,file,1.0,1048576,4096,0,0.004383,,228.15423226100845 +kiru-native,chars,file,1.0,1048576,8192,0,0.004290917,,233.05041789435685 +kiru-native,chars,file,1.0,1048576,8192,0,0.003996292,,250.23196503158425 +kiru-native,chars,file,1.0,1048576,8192,0,0.003969583,,251.9156294250555 +kiru-native,chars,file,1.0,1048576,16384,0,0.004203541,,237.89467023159762 +kiru-native,chars,file,1.0,1048576,16384,0,0.004473708,,223.52822312050765 +kiru-native,chars,file,1.0,1048576,16384,0,0.004258958,,234.7992161462968 +kiru-native,bytes,string,1.0,1048576,1024,0,0.000231291,,4323.557769217134 +kiru-native,bytes,string,1.0,1048576,1024,0,0.000220083,,4543.740316153451 +kiru-native,bytes,string,1.0,1048576,1024,0,0.000212167,,4713.268321652284 +kiru-native,bytes,string,1.0,1048576,4096,0,0.000236708,,4224.614292715075 +kiru-native,bytes,string,1.0,1048576,4096,0,0.000211,,4739.336492890995 +kiru-native,bytes,string,1.0,1048576,4096,0,0.00021,,4761.9047619047615 +kiru-native,bytes,string,1.0,1048576,8192,0,0.000236958,,4220.157158652588 +kiru-native,bytes,string,1.0,1048576,8192,0,0.000240791,,4152.979139585782 +kiru-native,bytes,string,1.0,1048576,8192,0,0.00022025,,4540.295119182747 +kiru-native,bytes,string,1.0,1048576,16384,0,0.000227625,,4393.190554640308 +kiru-native,bytes,string,1.0,1048576,16384,0,0.000240417,,4159.43964029166 +kiru-native,bytes,string,1.0,1048576,16384,0,0.000235709,,4242.519377707258 +kiru-native,bytes,file,1.0,1048576,1024,0,0.000929208,,1076.1853105009857 +kiru-native,bytes,file,1.0,1048576,1024,0,0.0009865,,1013.6847440446022 +kiru-native,bytes,file,1.0,1048576,1024,0,0.000914583,,1093.3944759524286 +kiru-native,bytes,file,1.0,1048576,4096,0,0.000980083,,1020.3217482600962 +kiru-native,bytes,file,1.0,1048576,4096,0,0.000928166,,1077.3934834932545 +kiru-native,bytes,file,1.0,1048576,4096,0,0.000933875,,1070.8071208673537 +kiru-native,bytes,file,1.0,1048576,8192,0,0.00111375,,897.8675645342313 +kiru-native,bytes,file,1.0,1048576,8192,0,0.001080292,,925.6756506574148 +kiru-native,bytes,file,1.0,1048576,8192,0,0.000947208,,1055.7343265681877 +kiru-native,bytes,file,1.0,1048576,16384,0,0.001071584,,933.1979574163108 +kiru-native,bytes,file,1.0,1048576,16384,0,0.000940041,,1063.783388171367 +kiru-native,bytes,file,1.0,1048576,16384,0,0.000984084,,1016.1734160904963 +kiru,chars,string,1.0,1048576,1024,0,0.0055376250529661775,3.9179420471191406,180.5828293600989 +kiru,chars,string,1.0,1048576,1024,0,0.005144166061654687,2.9384889602661133,194.39496859445032 +kiru,chars,string,1.0,1048576,1024,0,0.0045201670145615935,2.9384889602661133,221.2307635488971 +kiru,chars,string,1.0,1048576,4096,0,0.0036226249067112803,2.9384889602661133,276.0429317833592 +kiru,chars,string,1.0,1048576,4096,0,0.0035521669778972864,2.9384889602661133,281.518297484977 +kiru,chars,string,1.0,1048576,4096,0,0.003840374993160367,2.9384889602661133,260.3912382985986 +kiru,chars,string,1.0,1048576,8192,0,0.0037297080270946026,2.9384889602661133,268.1175021571294 +kiru,chars,string,1.0,1048576,8192,0,0.003694583079777658,2.9384889602661133,270.6665348719619 +kiru,chars,string,1.0,1048576,8192,0,0.0045389579609036446,2.9384889602661133,220.31488474083898 +kiru,chars,string,1.0,1048576,16384,0,0.0038777090376242995,2.9384889602661133,257.8842275934802 +kiru,chars,string,1.0,1048576,16384,0,0.004185041994787753,2.9384889602661133,238.94622831633396 +kiru,chars,string,1.0,1048576,16384,0,0.004049375071190298,2.9384889602661133,246.95168573407895 +kiru,chars,file,1.0,1048576,1024,0,0.004658040939830244,3.0001325607299805,214.68252703602124 +kiru,chars,file,1.0,1048576,1024,0,0.004277874948456883,3.0001325607299805,233.76092383455958 +kiru,chars,file,1.0,1048576,1024,0,0.004122416954487562,3.0001325607299805,242.57614187022602 +kiru,chars,file,1.0,1048576,4096,0,0.004368667025119066,3.0001325607299805,228.9027738324244 +kiru,chars,file,1.0,1048576,4096,0,0.004091707989573479,3.0001325607299805,244.3967171040083 +kiru,chars,file,1.0,1048576,4096,0,0.004430332919582725,3.0001325607299805,225.71667144467915 +kiru,chars,file,1.0,1048576,8192,0,0.004384875064715743,3.0001325607299805,228.05666871715687 +kiru,chars,file,1.0,1048576,8192,0,0.003931166022084653,3.0001325607299805,254.37745299541209 +kiru,chars,file,1.0,1048576,8192,0,0.003937500063329935,3.0001325607299805,253.9682498834812 +kiru,chars,file,1.0,1048576,16384,0,0.004256666987203062,3.0001325607299805,234.92558920073577 +kiru,chars,file,1.0,1048576,16384,0,0.004446499980986118,3.0001325607299805,224.8959865683449 +kiru,chars,file,1.0,1048576,16384,0,0.003948375000618398,3.0001325607299805,253.2687497624666 +kiru,bytes,string,1.0,1048576,1024,0,0.0007589589804410934,2.9384889602661133,1317.5942650007485 +kiru,bytes,string,1.0,1048576,1024,0,0.001088665914721787,2.9384889602661133,918.5554415521075 +kiru,bytes,string,1.0,1048576,1024,0,0.0008405830012634397,2.9384889602661133,1189.6505145797005 +kiru,bytes,string,1.0,1048576,4096,0,0.0007534590549767017,2.9384889602661133,1327.2121336851167 +kiru,bytes,string,1.0,1048576,4096,0,0.0007500419160351157,2.9384889602661133,1333.258820101971 +kiru,bytes,string,1.0,1048576,4096,0,0.0010088329436257482,2.9384889602661133,991.2443941471593 +kiru,bytes,string,1.0,1048576,8192,0,0.0008982500294223428,2.9384889602661133,1113.2757776173876 +kiru,bytes,string,1.0,1048576,8192,0,0.0011030420428141952,2.9384889602661133,906.5837576314829 +kiru,bytes,string,1.0,1048576,8192,0,0.0007244580192491412,2.9384889602661133,1380.3422329929374 +kiru,bytes,string,1.0,1048576,16384,0,0.0007070000283420086,2.9384889602661133,1414.4271003002757 +kiru,bytes,string,1.0,1048576,16384,0,0.0007221660343930125,2.9384889602661133,1384.7231140419246 +kiru,bytes,string,1.0,1048576,16384,0,0.0007148330332711339,2.9384889602661133,1398.9280761465639 +kiru,bytes,file,1.0,1048576,1024,0,0.0013864999637007713,3.0001325607299805,721.2405526004153 +kiru,bytes,file,1.0,1048576,1024,0,0.0011796659091487527,3.0001325607299805,847.6976339187426 +kiru,bytes,file,1.0,1048576,1024,0,0.0012188330292701721,3.0001325607299805,820.4569255879063 +kiru,bytes,file,1.0,1048576,4096,0,0.0015735409688204527,3.0001325607299805,635.5093510845246 +kiru,bytes,file,1.0,1048576,4096,0,0.0013527090195566416,3.0001325607299805,739.2572870754983 +kiru,bytes,file,1.0,1048576,4096,0,0.0011856249766424298,3.0001325607299805,843.4370224148778 +kiru,bytes,file,1.0,1048576,8192,0,0.0012032910017296672,3.0001325607299805,831.0541660849727 +kiru,bytes,file,1.0,1048576,8192,0,0.001582166994921863,3.0001325607299805,632.044533358115 +kiru,bytes,file,1.0,1048576,8192,0,0.0012327500153332949,3.0001325607299805,811.1944737876422 +kiru,bytes,file,1.0,1048576,16384,0,0.0012645840179175138,3.0001325607299805,790.7738717485736 +kiru,bytes,file,1.0,1048576,16384,0,0.0014995420351624489,3.0001325607299805,666.870268756199 +kiru,bytes,file,1.0,1048576,16384,0,0.0014811670407652855,3.0001325607299805,675.1432974658433 +langchain,chars,string,1.0,1048576,1024,0,2.9945295839570463,18.6186466217041,0.3339422677129057 +langchain,chars,string,1.0,1048576,1024,0,2.971040125004947,18.61863899230957,0.3365824620084304 +langchain,chars,string,1.0,1048576,1024,0,2.9735860410146415,18.61863136291504,0.3362942878420232 +langchain,chars,string,1.0,1048576,4096,0,2.9889144169865176,18.618738174438477,0.33456963314734844 +langchain,chars,string,1.0,1048576,4096,0,2.9844254580093548,18.618616104125977,0.335072868821797 +langchain,chars,string,1.0,1048576,4096,0,3.073730916948989,18.618608474731445,0.32533752206019656 +langchain,chars,string,1.0,1048576,8192,0,2.9703582919901237,18.618757247924805,0.3366597230699753 +langchain,chars,string,1.0,1048576,8192,0,2.9846570000518113,18.61858558654785,0.33504687472719336 +langchain,chars,string,1.0,1048576,8192,0,2.969381083967164,18.61857795715332,0.33677051605110114 +langchain,chars,string,1.0,1048576,16384,0,3.026977875037119,18.618684768676758,0.3303625071880439 +langchain,chars,string,1.0,1048576,16384,0,3.151922957971692,18.618562698364258,0.31726663796488047 +langchain,chars,string,1.0,1048576,16384,0,3.137174875009805,18.618555068969727,0.31875813107067374 +langchain,chars,file,1.0,1048576,1024,0,3.1553197079338133,20.532474517822266,0.31692509557290677 +langchain,chars,file,1.0,1048576,1024,0,3.064274000003934,20.532180786132812,0.3263415738927773 +langchain,chars,file,1.0,1048576,1024,0,3.066951583023183,20.53216552734375,0.3260566634098185 +langchain,chars,file,1.0,1048576,4096,0,3.062590082990937,20.53215789794922,0.3265210076770693 +langchain,chars,file,1.0,1048576,4096,0,3.0781566250370815,20.532325744628906,0.32486975869460616 +langchain,chars,file,1.0,1048576,4096,0,3.057069084024988,20.532142639160156,0.3271106973753381 +langchain,chars,file,1.0,1048576,8192,0,3.0551470420323312,20.532135009765625,0.3273164879601947 +langchain,chars,file,1.0,1048576,8192,0,3.067288332968019,20.532302856445312,0.32602086646101636 +langchain,chars,file,1.0,1048576,8192,0,3.0267302920110524,20.532119750976562,0.33038953045782266 +langchain,chars,file,1.0,1048576,16384,0,3.0345783749362454,20.53211212158203,0.32953507092101697 +langchain,chars,file,1.0,1048576,16384,0,3.0210590419592336,20.53228759765625,0.33100975059112864 +langchain,chars,file,1.0,1048576,16384,0,3.0452704170020297,20.53211212158203,0.3283780627220842 diff --git a/kiru-py/benchmark_throughput.png b/kiru-py/benchmark_throughput.png index 8f1597b..150df6c 100644 Binary files a/kiru-py/benchmark_throughput.png and b/kiru-py/benchmark_throughput.png differ diff --git a/kiru-py/src/lib.rs b/kiru-py/src/lib.rs index d7b6993..bb6d6f9 100644 --- a/kiru-py/src/lib.rs +++ b/kiru-py/src/lib.rs @@ -1,5 +1,8 @@ use ::kiru as kiru_core; -use kiru_core::{ChunkerEnum, ChunkerWithStrategy, HigherOrderSource, Source, SourceGenerator}; +use kiru_core::{ + BytesChunker, CharactersChunker, ChunkerBuilder, ChunkerWithStrategy, HigherOrderSource, + Source, SourceGenerator, +}; use pyo3::prelude::*; // ============================================================================ @@ -51,10 +54,16 @@ fn parse_source_strings(source_strings: Vec) -> PyResult), + Chars(ChunkerWithStrategy), +} + /// A wrapper around a chunker strategy, providing methods to chunk various sources. #[pyclass] pub struct ChunkerBuilderWrapper { - inner: ChunkerWithStrategy, + inner: PyChunker, } /// An iterator over chunks produced from one or more sources. @@ -82,11 +91,11 @@ impl Chunker { /// ValueError: If chunk_size is 0 or overlap is not less than chunk_size. #[staticmethod] fn by_bytes(chunk_size: usize, overlap: usize) -> PyResult { - let chunker = kiru_core::ChunkerBuilder::by_bytes(ChunkerEnum::Bytes { - chunk_size, - overlap, - }); - Ok(ChunkerBuilderWrapper { inner: chunker }) + let chunker = ChunkerBuilder::by_bytes(chunk_size, overlap) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerBuilderWrapper { + inner: PyChunker::Bytes(chunker), + }) } /// Create a characters-based chunker with the specified chunk size and overlap. @@ -102,11 +111,80 @@ impl Chunker { /// ValueError: If chunk_size is 0 or overlap is not less than chunk_size. #[staticmethod] fn by_characters(chunk_size: usize, overlap: usize) -> PyResult { - let chunker = kiru_core::ChunkerBuilder::by_characters(ChunkerEnum::Characters { - chunk_size, - overlap, - }); - Ok(ChunkerBuilderWrapper { inner: chunker }) + let chunker = ChunkerBuilder::by_characters(chunk_size, overlap) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerBuilderWrapper { + inner: PyChunker::Chars(chunker), + }) + } +} + +// Small helper to de-duplicate single-source handling. +impl ChunkerBuilderWrapper { + fn on_source_internal(&self, source: Source) -> PyResult { + match &self.inner { + PyChunker::Bytes(b) => { + let inner_iter = b + .on_source(source) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + PyChunker::Chars(c) => { + let inner_iter = c + .on_source(source) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + } + } + + fn on_sources_internal(&self, sources: Vec) -> PyResult { + match &self.inner { + PyChunker::Bytes(b) => { + let inner_iter = b + .on_sources(sources) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + PyChunker::Chars(c) => { + let inner_iter = c + .on_sources(sources) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + } + } + fn on_sources_par_internal( + &self, + sources: Vec, + channel_size: usize, + ) -> PyResult { + match &self.inner { + PyChunker::Bytes(b) => { + let inner_iter = b + .on_sources_par_stream(sources, channel_size) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + PyChunker::Chars(c) => { + let inner_iter = c + .on_sources_par_stream(sources, channel_size) + .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; + Ok(ChunkerIterator { + inner: Box::new(inner_iter), + }) + } + } } } @@ -124,11 +202,7 @@ impl ChunkerBuilderWrapper { /// ValueError: If the input cannot be processed. fn on_string(&self, text: String) -> PyResult { let source = Source::Text(text); - let iterator = self - .inner - .on_source(source) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - Ok(ChunkerIterator { inner: iterator }) + self.on_source_internal(source) } /// Chunk a single file from a local path. @@ -143,11 +217,7 @@ impl ChunkerBuilderWrapper { /// ValueError: If the file cannot be read (e.g., does not exist). fn on_file(&self, path: String) -> PyResult { let source = Source::File(path); - let iterator = self - .inner - .on_source(source) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - Ok(ChunkerIterator { inner: iterator }) + self.on_source_internal(source) } /// Chunk content from an HTTP/HTTPS URL. @@ -162,11 +232,7 @@ impl ChunkerBuilderWrapper { /// ValueError: If the URL cannot be fetched or content cannot be processed. fn on_http(&self, url: String) -> PyResult { let source = Source::Http(url); - let iterator = self - .inner - .on_source(source) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - Ok(ChunkerIterator { inner: iterator }) + self.on_source_internal(source) } /// Chunk multiple sources specified as strings with prefixes. @@ -200,12 +266,7 @@ impl ChunkerBuilderWrapper { let sources = HigherOrderSource::into_flattened_sources(higher_order_sources) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - let iterator = self - .inner - .on_sources(sources) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - - Ok(ChunkerIterator { inner: iterator }) + self.on_sources_internal(sources) } /// Chunk multiple sources in parallel, specified as strings with prefixes. @@ -244,14 +305,7 @@ impl ChunkerBuilderWrapper { let sources = HigherOrderSource::into_flattened_sources(higher_order_sources) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - let iterator = self - .inner - .on_sources_par_stream(sources, channel_size.unwrap_or(1000)) - .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?; - - Ok(ChunkerIterator { - inner: Box::new(iterator), - }) + self.on_sources_par_internal(sources, channel_size.unwrap_or(1000)) } }