From beef312810e0347c22b3f2f0a1e85a113b788f8a Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Thu, 11 Sep 2025 22:05:34 -0700 Subject: [PATCH 1/8] clean up old profiler --- .../expander_no_oversubscribe.rs | 1 - .../expander_no_oversubscribe/profiler.rs | 78 ------------- .../expander_no_oversubscribe/prove_impl.rs | 30 ----- .../expander_no_oversubscribe/server_bin.rs | 108 ++++++++++++++++++ .../expander_no_oversubscribe/server_fn.rs | 41 +------ 5 files changed, 111 insertions(+), 147 deletions(-) delete mode 100644 expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/profiler.rs diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe.rs index 65954def..0ce09b0b 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe.rs @@ -1,4 +1,3 @@ pub mod api_no_oversubscribe; -pub mod profiler; pub mod prove_impl; pub mod server_fn; diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/profiler.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/profiler.rs deleted file mode 100644 index ed9421cc..00000000 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/profiler.rs +++ /dev/null @@ -1,78 +0,0 @@ -#[cfg(feature = "zkcuda_profile")] -mod profiler_enabled { - use std::collections::HashMap; - - use arith::Fr; - use halo2curves::ff::PrimeField; - - #[derive(Clone, Debug, Default)] - pub struct NBytesProfiler { - pub bytes_stats: HashMap, - } - - impl NBytesProfiler { - pub fn new() -> Self { - NBytesProfiler { - bytes_stats: HashMap::new(), - } - } - - pub fn add_bytes(&mut self, n_bytes: usize) { - *self.bytes_stats.entry(n_bytes).or_insert(0) += 1; - } - - pub fn add_fr(&mut self, fr: Fr) { - let le_bytes = fr.to_repr(); - let be_leading_zeros_bytes = le_bytes.into_iter().rev().take_while(|&b| b == 0).count(); - let n_bytes = le_bytes.len() - be_leading_zeros_bytes; - self.add_bytes(n_bytes); - } - - pub fn print_stats(&self) { - for (bytes, count) in &self.bytes_stats { - println!("{bytes} bytes: {count}"); - } - } - } -} - -#[cfg(not(feature = "zkcuda_profile"))] -mod profiler_disabled { - use arith::Fr; - - #[derive(Clone, Debug, Default)] - pub struct NBytesProfiler; - - impl NBytesProfiler { - pub fn new() -> Self { - NBytesProfiler - } - - pub fn add_bytes(&mut self, _n_bytes: usize) {} - - pub fn add_fr(&mut self, _fr: Fr) {} - - pub fn print_stats(&self) {} - } -} - -#[cfg(not(feature = "zkcuda_profile"))] -pub use profiler_disabled::NBytesProfiler; -#[cfg(feature = "zkcuda_profile")] -pub use profiler_enabled::NBytesProfiler; - -#[cfg(feature = "zkcuda_profile")] -mod test { - #![allow(unused_imports)] - use super::NBytesProfiler; - use arith::Fr; - - #[test] - fn test_n_bytes_profiler() { - let mut profiler = NBytesProfiler::new(); - profiler.add_bytes(32); - profiler.add_bytes(64); - profiler.add_fr(Fr::from(256u64)); - profiler.print_stats(); - } -} diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs index bc980372..cc422ae3 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs @@ -20,7 +20,6 @@ use crate::{ }, structs::{ExpanderProof, ExpanderProverSetup}, }, - expander_no_oversubscribe::profiler::NBytesProfiler, expander_parallelized::{ prove_impl::partition_single_gkr_claim_and_open_pcs_mpi, server_ctrl::generate_local_mpi_config, @@ -38,7 +37,6 @@ pub fn mpi_prove_no_oversubscribe_impl( prover_setup: &ExpanderProverSetup, GetPCS>, computation_graph: &ComputationGraph, values: &[impl AsRef<[SIMDField]>], - n_bytes_profiler: &mut NBytesProfiler, ) -> Option>> where ::FieldConfig: FieldEngine, @@ -91,7 +89,6 @@ where &commitment_values, next_power_of_two(template.parallel_count()), template.is_broadcast(), - n_bytes_profiler, ); single_kernel_gkr_timer.stop(); @@ -201,7 +198,6 @@ pub fn prove_kernel_gkr_no_oversubscribe( commitments_values: &[&[F::SimdCircuitField]], parallel_count: usize, is_broadcast: &[bool], - n_bytes_profiler: &mut NBytesProfiler, ) -> Option<(T, ExpanderDualVarChallenge)> where F: FieldEngine, @@ -223,7 +219,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 2 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -231,7 +226,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 4 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -239,7 +233,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 8 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -247,7 +240,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 16 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -255,7 +247,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 32 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -263,7 +254,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 64 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -271,7 +261,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 128 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -279,7 +268,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 256 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -287,7 +275,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 512 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -295,7 +282,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 1024 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -303,7 +289,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), 2048 => prove_kernel_gkr_internal::, T, ECCConfig>( &local_mpi_config, @@ -311,7 +296,6 @@ where commitments_values, parallel_count, is_broadcast, - n_bytes_profiler, ), _ => { panic!("Unsupported parallel count: {parallel_count}"); @@ -325,7 +309,6 @@ pub fn prove_kernel_gkr_internal( commitments_values: &[&[FBasic::SimdCircuitField]], parallel_count: usize, is_broadcast: &[bool], - n_bytes_profiler: &mut NBytesProfiler, ) -> Option<(T, ExpanderDualVarChallenge)> where FBasic: FieldEngine, @@ -357,7 +340,6 @@ where kernel.layered_circuit_input(), &mut transcript, mpi_config, - n_bytes_profiler, ); Some((transcript, challenge)) @@ -389,7 +371,6 @@ pub fn prove_gkr_with_local_vals_multi_copies( partition_info: &[LayeredCircuitInputVec], transcript: &mut T, mpi_config: &MPIConfig, - _n_bytes_profiler: &mut NBytesProfiler, ) -> ExpanderDualVarChallenge where FBasic: FieldEngine, @@ -423,17 +404,6 @@ where expander_circuit.fill_rnd_coefs(transcript); expander_circuit.evaluate(); - #[cfg(feature = "zkcuda_profile")] - { - expander_circuit.layers.iter().for_each(|layer| { - layer.input_vals.iter().for_each(|val| { - val.unpack().iter().for_each(|fr| { - _n_bytes_profiler.add_fr(*fr); - }) - }); - }); - } - let (claimed_v, challenge) = gkr::gkr_prove(expander_circuit, prover_scratch, transcript, mpi_config); assert_eq!(claimed_v, FBasic::ChallengeField::from(0u32)); diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs index e6eb38d6..66df9c99 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs @@ -11,6 +11,114 @@ use expander_compiler::zkcuda::proving_system::{ }; use gkr_engine::{FiatShamirHashType, PolynomialCommitmentType}; +use std::alloc::{GlobalAlloc, Layout, System}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +struct ProfilingAllocator { + threshold: usize, + allocated: AtomicUsize, + deallocated: AtomicUsize, + peak: AtomicUsize, + allocation_count: AtomicUsize, +} + +impl ProfilingAllocator { + const fn new(threshold: usize) -> Self { + ProfilingAllocator { + threshold, + allocated: AtomicUsize::new(0), + deallocated: AtomicUsize::new(0), + peak: AtomicUsize::new(0), + allocation_count: AtomicUsize::new(0), + } + } + + fn current_usage(&self) -> usize { + self.allocated + .load(Ordering::Relaxed) + .saturating_sub(self.deallocated.load(Ordering::Relaxed)) + } + + fn print_stats(&self) { + eprintln!("\n=== Memory Statistics ==="); + eprintln!("Current usage: {} bytes", self.current_usage()); + eprintln!("Peak usage: {} bytes", self.peak.load(Ordering::Relaxed)); + eprintln!( + "Total allocated: {} bytes", + self.allocated.load(Ordering::Relaxed) + ); + eprintln!( + "Total deallocated: {} bytes", + self.deallocated.load(Ordering::Relaxed) + ); + eprintln!( + "Allocation count: {}", + self.allocation_count.load(Ordering::Relaxed) + ); + } +} + +unsafe impl GlobalAlloc for ProfilingAllocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + let size = layout.size(); + let ptr = System.alloc(layout); + + if !ptr.is_null() { + self.allocated.fetch_add(size, Ordering::Relaxed); + self.allocation_count.fetch_add(1, Ordering::Relaxed); + + let current = self.current_usage(); + let mut peak = self.peak.load(Ordering::Relaxed); + while current > peak { + match self.peak.compare_exchange_weak( + peak, + current, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(x) => peak = x, + } + } + + if size >= self.threshold { + eprintln!( + "[ALLOC] {} bytes | Total: {} bytes", + size, + self.current_usage() + ); + } + } + + ptr + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + let size = layout.size(); + self.deallocated.fetch_add(size, Ordering::Relaxed); + + if size >= self.threshold { + eprintln!( + "[DEALLOC] {} bytes | Total: {} bytes", + size, + self.current_usage() + ); + } + + System.dealloc(ptr, layout) + } +} + +#[global_allocator] +static ALLOCATOR: ProfilingAllocator = ProfilingAllocator::new(1024 * 16); + +// Optional: Print stats on program exit +impl Drop for ProfilingAllocator { + fn drop(&mut self) { + self.print_stats(); + } +} + async fn async_main() { let expander_exec_args = ExpanderExecArgs::parse(); diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs index 85c5955f..9e8f7a40 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs @@ -10,9 +10,7 @@ use crate::{ config::{GetFieldConfig, GetPCS, ZKCudaConfig}, structs::{ExpanderProverSetup, ExpanderVerifierSetup}, }, - expander_no_oversubscribe::{ - profiler::NBytesProfiler, prove_impl::mpi_prove_no_oversubscribe_impl, - }, + expander_no_oversubscribe::prove_impl::mpi_prove_no_oversubscribe_impl, expander_parallelized::{server_ctrl::SharedMemoryWINWrapper, server_fns::ServerFns}, CombinedProof, Expander, ExpanderNoOverSubscribe, ExpanderPCSDefered, ParallelizedExpander, @@ -58,44 +56,11 @@ where computation_graph: &ComputationGraph, values: &[impl AsRef<[SIMDField]>], ) -> Option>> { - let mut n_bytes_profiler = NBytesProfiler::new(); - - #[cfg(feature = "zkcuda_profile")] - { - use arith::SimdField; - use gkr_engine::MPIEngine; - - values.iter().for_each(|vals| { - vals.as_ref().iter().for_each(|fr| { - let fr_unpacked = fr.unpack(); - assert!(fr_unpacked.len() == 1); - n_bytes_profiler.add_fr(fr_unpacked[0]); - }); - }); - if global_mpi_config.is_root() { - println!("NBytesProfiler stats before proving:"); - n_bytes_profiler.print_stats(); - } - } - - let proof = mpi_prove_no_oversubscribe_impl::( + mpi_prove_no_oversubscribe_impl::( global_mpi_config, prover_setup, computation_graph, values, - &mut n_bytes_profiler, - ); - - #[cfg(feature = "zkcuda_profile")] - { - use gkr_engine::MPIEngine; - - if global_mpi_config.is_root() { - println!("NBytesProfiler stats after proving:"); - n_bytes_profiler.print_stats(); - } - } - - proof + ) } } From 0283f2fc91f698b4e41d856440e66841f5b7ce0c Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Thu, 11 Sep 2025 22:11:12 -0700 Subject: [PATCH 2/8] macros rules to simplify code --- .../expander_no_oversubscribe/prove_impl.rs | 100 ++++-------------- 1 file changed, 23 insertions(+), 77 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs index cc422ae3..0399151b 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs @@ -211,6 +211,18 @@ where let local_mpi_config = local_mpi_config.unwrap(); let local_world_size = local_mpi_config.world_size(); + macro_rules! call_prove_kernel_gkr_internal { + ($n:expr) => { + prove_kernel_gkr_internal::, T, ECCConfig>( + &local_mpi_config, + kernel, + commitments_values, + parallel_count, + is_broadcast, + ) + }; + } + let n_local_copies = parallel_count / local_world_size; match n_local_copies { 1 => prove_kernel_gkr_internal::( @@ -220,83 +232,17 @@ where parallel_count, is_broadcast, ), - 2 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 4 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 8 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 16 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 32 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 64 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 128 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 256 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 512 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 1024 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), - 2048 => prove_kernel_gkr_internal::, T, ECCConfig>( - &local_mpi_config, - kernel, - commitments_values, - parallel_count, - is_broadcast, - ), + 2 => call_prove_kernel_gkr_internal!(2), + 4 => call_prove_kernel_gkr_internal!(4), + 8 => call_prove_kernel_gkr_internal!(8), + 16 => call_prove_kernel_gkr_internal!(16), + 32 => call_prove_kernel_gkr_internal!(32), + 64 => call_prove_kernel_gkr_internal!(64), + 128 => call_prove_kernel_gkr_internal!(128), + 256 => call_prove_kernel_gkr_internal!(256), + 512 => call_prove_kernel_gkr_internal!(512), + 1024 => call_prove_kernel_gkr_internal!(1024), + 2048 => call_prove_kernel_gkr_internal!(2048), _ => { panic!("Unsupported parallel count: {parallel_count}"); } From 86d6e402c0061be2dd5428e9beb094bdc595890b Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Thu, 11 Sep 2025 22:16:00 -0700 Subject: [PATCH 3/8] adjust default threshold --- .../proving_system/expander_no_oversubscribe/server_bin.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs index 66df9c99..f11b8063 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs @@ -110,7 +110,7 @@ unsafe impl GlobalAlloc for ProfilingAllocator { } #[global_allocator] -static ALLOCATOR: ProfilingAllocator = ProfilingAllocator::new(1024 * 16); +static ALLOCATOR: ProfilingAllocator = ProfilingAllocator::new(1024 * 1024 * 16); // 16 MB threshold // Optional: Print stats on program exit impl Drop for ProfilingAllocator { From 4e5cedc4287811fe50ce57de2b043012c433d72f Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Thu, 11 Sep 2025 22:27:16 -0700 Subject: [PATCH 4/8] print memory stats --- .../proving_system/expander_no_oversubscribe/server_bin.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs index f11b8063..4cac64ff 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_bin.rs @@ -178,6 +178,8 @@ async fn async_main() { panic!("Combination of {field_type:?}, {pcs_type:?}, and {fiat_shamir_hash:?} not supported for no oversubscribe expander proving system."); } } + + ALLOCATOR.print_stats(); } pub fn main() { From f191d3b35333c229994a5550c3dcc35fc830f61e Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Fri, 12 Sep 2025 00:28:18 -0700 Subject: [PATCH 5/8] redirect each process's output --- .../expander_no_oversubscribe/server_fn.rs | 2 ++ .../expander_parallelized/cmd_utils.rs | 2 +- .../expander_parallelized/prove_impl.rs | 2 ++ .../expander_parallelized/server_fns.rs | 2 ++ .../expander_pcs_defered/prove_impl.rs | 16 ++++++++++++---- .../expander_pcs_defered/server_fns.rs | 2 ++ .../expander_pcs_defered/setup_impl.rs | 2 ++ 7 files changed, 23 insertions(+), 5 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs index 9e8f7a40..4f2f8522 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/server_fn.rs @@ -30,6 +30,7 @@ where verifier_setup: &mut ExpanderVerifierSetup, GetPCS>, mpi_win: &mut Option, ) { + eprintln!("Entering setup_request_handler for ExpanderNoOverSubscribe"); match ZC::BATCH_PCS { true => ExpanderPCSDefered::::setup_request_handler( global_mpi_config, @@ -48,6 +49,7 @@ where mpi_win, ), } + eprintln!("Exiting setup_request_handler for ExpanderNoOverSubscribe"); } fn prove_request_handler( diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/cmd_utils.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/cmd_utils.rs index b8d6c830..2976b1f9 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/cmd_utils.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/cmd_utils.rs @@ -16,7 +16,7 @@ pub fn start_server( let batch_pcs_option = if batch_pcs { "--batch-pcs" } else { "" }; let cmd_str = format!( - "mpiexec -n {max_parallel_count} {overscribe} {binary} --field-type {field_name} --poly-commit {pcs_name} --port-number {port_number} {batch_pcs_option} --fiat-shamir-hash {fiat_shamir_hash}" + "mpiexec -n {max_parallel_count} --output-filename worker_log {overscribe} {binary} --field-type {field_name} --poly-commit {pcs_name} --port-number {port_number} {batch_pcs_option} --fiat-shamir-hash {fiat_shamir_hash}", ); exec_command(&cmd_str, false); } diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/prove_impl.rs index 5605daf0..49e8b4c0 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/prove_impl.rs @@ -134,6 +134,7 @@ where T: Transcript, ECCConfig: Config, { + eprintln!("Entering prove_kernel_gkr"); let local_mpi_config = generate_local_mpi_config(mpi_config, parallel_count); local_mpi_config.as_ref()?; @@ -162,6 +163,7 @@ where &local_mpi_config, ); + eprintln!("Exiting prove_kernel_gkr"); Some((transcript, challenge)) } diff --git a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_fns.rs b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_fns.rs index d3e326d9..3ced8a3a 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_fns.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_parallelized/server_fns.rs @@ -148,6 +148,7 @@ pub fn read_circuit( C: GKREngine, ECCConfig: Config, { + eprintln!("Entering read_circuit"); let (cg, win) = if global_mpi_config.is_root() { let computation_graph_bytes = std::fs::read(setup_file).expect("Failed to read computation graph from file"); @@ -162,4 +163,5 @@ pub fn read_circuit( *computation_graph = cg; mpi_win.replace(SharedMemoryWINWrapper { win }); + eprintln!("Exiting read_circuit"); } diff --git a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/prove_impl.rs index 72545956..2cf308c2 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/prove_impl.rs @@ -37,6 +37,7 @@ where C: GKREngine, ECCConfig: Config, { + eprintln!("Entering max_len_setup_commit_impl"); assert_eq!(prover_setup.p_keys.len(), 1); let len_to_commit = prover_setup.p_keys.keys().next().cloned().unwrap(); @@ -47,6 +48,7 @@ where local_commit_impl::(prover_setup.p_keys.get(&len_to_commit).unwrap(), vals); commitment.vals_len = actual_len; // Store the actual length in the commitment + eprintln!("Exiting max_len_setup_commit_impl"); (commitment, state) } @@ -106,6 +108,7 @@ where C: GKREngine, ECCConfig: Config, { + eprintln!("Entering mpi_prove_with_pcs_defered"); let commit_timer = Timer::new("Commit to all input", global_mpi_config.is_root()); let (commitments, _states) = if global_mpi_config.is_root() { let (commitments, states) = values @@ -144,6 +147,7 @@ where ); if global_mpi_config.is_root() { + eprintln!("Entering pcs claim extraction"); let (mut transcript, challenge) = gkr_end_state.unwrap(); assert!(challenge.challenge_y().is_none()); let challenge = challenge.challenge_x(); @@ -157,7 +161,7 @@ where vals_ref.extend(local_vals_ref); challenges.extend(local_challenges); - + eprintln!("Exiting pcs claim extraction"); Some(ExpanderProof { data: vec![transcript.finalize_and_get_proof()], }) @@ -168,22 +172,26 @@ where .collect::>(); prove_timer.stop(); - if global_mpi_config.is_root() { + let ret = if global_mpi_config.is_root() { + eprintln!("Entering pcs batch opening"); let mut proofs = proofs.into_iter().map(|p| p.unwrap()).collect::>(); let pcs_opening_timer = Timer::new("Batch PCS Opening for all kernels", true); let pcs_batch_opening = open_defered_pcs::(prover_setup, &vals_ref, &challenges); pcs_opening_timer.stop(); - proofs.push(pcs_batch_opening); + eprintln!("Exiting pcs batch opening"); Some(CombinedProof { commitments: commitments.unwrap(), proofs, }) } else { None - } + }; + + eprintln!("Exiting mpi_prove_with_pcs_defered"); + ret } pub fn extract_pcs_claims<'a, C: GKREngine>( diff --git a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/server_fns.rs b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/server_fns.rs index a34894e0..b4535f8a 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/server_fns.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/server_fns.rs @@ -31,6 +31,7 @@ where verifier_setup: &mut ExpanderVerifierSetup, mpi_win: &mut Option, ) { + eprintln!("Entering setup_request_handler for ExpanderPCSDefered"); let setup_file = if global_mpi_config.is_root() { let setup_file = setup_file.expect("Setup file path must be provided"); broadcast_string(global_mpi_config, Some(setup_file)) @@ -44,6 +45,7 @@ where (*prover_setup, *verifier_setup) = pcs_setup_max_length_only::(computation_graph); } + eprintln!("Exiting setup_request_handler for ExpanderPCSDefered"); } fn prove_request_handler( diff --git a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/setup_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/setup_impl.rs index dba77cf4..f7a3feb9 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/setup_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_pcs_defered/setup_impl.rs @@ -23,6 +23,7 @@ where C: GKREngine, ECCConfig: Config, { + eprintln!("Entering pcs_setup_max_length_only"); let mut p_keys = HashMap::new(); let mut v_keys = HashMap::new(); let max_commitment_len = computation_graph @@ -40,6 +41,7 @@ where p_keys.insert(max_commitment_len, p_key); v_keys.insert(max_commitment_len, v_key); + eprintln!("Exiting pcs_setup_max_length_only"); ( ExpanderProverSetup { p_keys }, ExpanderVerifierSetup { v_keys }, From 7ed632575c776ca6520c5528a0a55759e793743d Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Sun, 14 Sep 2025 17:05:16 -0700 Subject: [PATCH 6/8] proper entering and exiting --- .../expander_no_oversubscribe/prove_impl.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs index 0399151b..65bf2da3 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs @@ -95,6 +95,7 @@ where match ZC::BATCH_PCS { true => { if global_mpi_config.is_root() { + eprintln!("Entering pcs claim extraction"); let (mut transcript, challenge) = gkr_end_state.unwrap(); assert!(challenge.challenge_y().is_none()); let challenge = challenge.challenge_x(); @@ -110,6 +111,7 @@ where vals_ref.extend(local_vals_ref); challenges.extend(local_challenges); + eprintln!("Exiting pcs claim extraction"); Some(ExpanderProof { data: vec![transcript.finalize_and_get_proof()], }) @@ -158,6 +160,7 @@ where match ZC::BATCH_PCS { true => { if global_mpi_config.is_root() { + eprintln!("Entering Batch PCS Opening"); let mut proofs = proofs.into_iter().map(|p| p.unwrap()).collect::>(); let pcs_opening_timer = Timer::new("Batch PCS Opening for all kernels", true); @@ -169,6 +172,7 @@ where pcs_opening_timer.stop(); proofs.push(pcs_batch_opening); + eprintln!("Exiting Batch PCS Opening"); Some(CombinedProof { commitments: commitments.unwrap(), proofs, @@ -204,6 +208,7 @@ where T: Transcript, ECCConfig: Config, { + eprint!("Entering prove_kernel_gkr_no_oversubscribe"); let local_mpi_config = generate_local_mpi_config(mpi_config, parallel_count); local_mpi_config.as_ref()?; @@ -224,7 +229,7 @@ where } let n_local_copies = parallel_count / local_world_size; - match n_local_copies { + let ret = match n_local_copies { 1 => prove_kernel_gkr_internal::( &local_mpi_config, kernel, @@ -246,7 +251,9 @@ where _ => { panic!("Unsupported parallel count: {parallel_count}"); } - } + }; + eprintln!("Exiting prove_kernel_gkr_no_oversubscribe"); + ret } pub fn prove_kernel_gkr_internal( @@ -263,6 +270,7 @@ where T: Transcript, ECCConfig: Config, { + eprint!("Entering prove_kernel_gkr_internal"); let world_rank = mpi_config.world_rank(); let world_size = mpi_config.world_size(); let n_copies = parallel_count / world_size; @@ -288,6 +296,7 @@ where mpi_config, ); + eprintln!("Exiting prove_kernel_gkr_internal"); Some((transcript, challenge)) } From b45c5fa3f4039ea7f2969c9985de9b7aa32f1dfd Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Sun, 14 Sep 2025 18:18:14 -0700 Subject: [PATCH 7/8] More detailed memory consumption --- .../expander_no_oversubscribe/prove_impl.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs index 65bf2da3..853a7b7c 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs @@ -208,7 +208,7 @@ where T: Transcript, ECCConfig: Config, { - eprint!("Entering prove_kernel_gkr_no_oversubscribe"); + eprintln!("Entering prove_kernel_gkr_no_oversubscribe"); let local_mpi_config = generate_local_mpi_config(mpi_config, parallel_count); local_mpi_config.as_ref()?; @@ -270,7 +270,7 @@ where T: Transcript, ECCConfig: Config, { - eprint!("Entering prove_kernel_gkr_internal"); + eprintln!("Entering prove_kernel_gkr_internal"); let world_rank = mpi_config.world_rank(); let world_size = mpi_config.world_size(); let n_copies = parallel_count / world_size; @@ -283,8 +283,10 @@ where parallel_count, ); + eprint!("Preparing expander circuit and prover scratchpad..."); let (mut expander_circuit, mut prover_scratch) = prepare_expander_circuit::(kernel, world_size); + eprintln!("Circuit and scratchpad prepared"); let mut transcript = T::new(); let challenge = prove_gkr_with_local_vals_multi_copies::( @@ -333,6 +335,7 @@ where FieldEngine, T: Transcript, { + eprintln!("Preparing input vals multiple copies"); let input_vals_multi_copies = local_commitment_values_multi_copies .iter() .map(|local_commitment_values| { @@ -343,7 +346,9 @@ where ) }) .collect::>(); + eprintln!("Input vals multiple copies prepared"); + eprintln!("Packing input vals multiple copies into single input vals"); let mut input_vals = vec![FMulti::SimdCircuitField::ZERO; 1 << expander_circuit.log_input_size()]; @@ -355,13 +360,18 @@ where *vals = FMulti::SimdCircuitField::pack(&vals_unpacked); } expander_circuit.layers[0].input_vals = input_vals; + eprintln!("Input vals multiple copies packed into single input vals"); + eprint!("Evaluating expander circuit..."); expander_circuit.fill_rnd_coefs(transcript); expander_circuit.evaluate(); + eprintln!("Expander circuit evaluated"); + eprint!("Proving GKR..."); let (claimed_v, challenge) = gkr::gkr_prove(expander_circuit, prover_scratch, transcript, mpi_config); assert_eq!(claimed_v, FBasic::ChallengeField::from(0u32)); + eprintln!("GKR proved"); let n_simd_vars_basic = FBasic::SimdCircuitField::PACK_SIZE.ilog2() as usize; From 75d8931b49c06f917af35d1c7e0a7161e7596e4e Mon Sep 17 00:00:00 2001 From: Zhiyong Fang Date: Sun, 14 Sep 2025 18:53:51 -0700 Subject: [PATCH 8/8] detailed circuit and scratch --- .../src/zkcuda/proving_system/expander/prove_impl.rs | 4 ++++ .../proving_system/expander_no_oversubscribe/prove_impl.rs | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/expander_compiler/src/zkcuda/proving_system/expander/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander/prove_impl.rs index cc2afad4..e816026a 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander/prove_impl.rs @@ -28,12 +28,16 @@ where ECCConfig: Config, ECCConfig::FieldConfig: FieldEngine, { + eprintln!("Exporting to expander circuit..."); let mut expander_circuit = kernel.layered_circuit().export_to_expander().flatten(); expander_circuit.pre_process_gkr(); + eprintln!("Expander circuit exported, n_layers = {}", expander_circuit.layers.len()); + eprintln!("Preparing prover scratch pad..."); let (max_num_input_var, max_num_output_var) = super::utils::max_n_vars(&expander_circuit); let prover_scratch = ProverScratchPad::::new(max_num_input_var, max_num_output_var, mpi_world_size); + eprintln!("Prover scratch pad prepared"); (expander_circuit, prover_scratch) } diff --git a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs index 853a7b7c..1bbf1aef 100644 --- a/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs +++ b/expander_compiler/src/zkcuda/proving_system/expander_no_oversubscribe/prove_impl.rs @@ -283,7 +283,7 @@ where parallel_count, ); - eprint!("Preparing expander circuit and prover scratchpad..."); + eprintln!("Preparing expander circuit and prover scratchpad..."); let (mut expander_circuit, mut prover_scratch) = prepare_expander_circuit::(kernel, world_size); eprintln!("Circuit and scratchpad prepared"); @@ -362,12 +362,12 @@ where expander_circuit.layers[0].input_vals = input_vals; eprintln!("Input vals multiple copies packed into single input vals"); - eprint!("Evaluating expander circuit..."); + eprintln!("Evaluating expander circuit..."); expander_circuit.fill_rnd_coefs(transcript); expander_circuit.evaluate(); eprintln!("Expander circuit evaluated"); - eprint!("Proving GKR..."); + eprintln!("Proving GKR..."); let (claimed_v, challenge) = gkr::gkr_prove(expander_circuit, prover_scratch, transcript, mpi_config); assert_eq!(claimed_v, FBasic::ChallengeField::from(0u32));