From dc6239d62ffc5e5e6c17031f283c0d064afcb96c Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Mon, 24 Nov 2025 23:17:57 +0100 Subject: [PATCH 01/21] Add anonymizer tool --- Cargo.toml | 5 + REUSE.toml | 2 +- anonymizer_data/expected_detect_output.txt | 1 + src/anonymizer/README.md | 93 +++++ src/anonymizer/anonymizer.rs | 144 ++++++++ src/anonymizer/detect.rs | 385 +++++++++++++++++++++ src/anonymizer/pdf.rs | 293 ++++++++++++++++ src/anonymizer/replace.rs | 111 ++++++ 8 files changed, 1033 insertions(+), 1 deletion(-) create mode 100644 anonymizer_data/expected_detect_output.txt create mode 100644 src/anonymizer/README.md create mode 100644 src/anonymizer/anonymizer.rs create mode 100644 src/anonymizer/detect.rs create mode 100644 src/anonymizer/pdf.rs create mode 100644 src/anonymizer/replace.rs diff --git a/Cargo.toml b/Cargo.toml index 3b16afc..3d42011 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,9 @@ homepage = "https://github.com/jczaja/e-trade-tax-return-pl-helper" [[bin]] name = "etradeTaxReturnHelper" path = "src/main.rs" +[[bin]] +name = "etradeAnonymizer" +path = "src/anonymizer/anonymizer.rs" [[bin]] name = "gen_exchange_rates" @@ -45,3 +48,5 @@ polars = "0.35.4" csv = "1.3.0" serde_json = { version = "=1.0.133", optional = true } holidays = { version = "0.1.0", default-features = false, features = ["PL"] } + +flate2 = "1.1.5" diff --git a/REUSE.toml b/REUSE.toml index d0c53c7..8c8c496 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -21,7 +21,7 @@ path = [ "data/G&L_Expanded_polish.xlsx", "data/ecb_example_response.xml", "revolut_data/*.csv", - "revolut_data/*.tsv", + "anonymizer_data/*", ] SPDX-FileCopyrightText = "2025 RustInFinance" SPDX-License-Identifier = "LicenseRef-Private-Data" diff --git a/anonymizer_data/expected_detect_output.txt b/anonymizer_data/expected_detect_output.txt new file mode 100644 index 0000000..8498362 --- /dev/null +++ b/anonymizer_data/expected_detect_output.txt @@ -0,0 +1 @@ +replace "sample_statement.pdf" 
"out_sample_statement.pdf" "JAN KOWALSKI" "XXXXXXXXXXXX" "UL. SWIETOKRZYSKA 12" "XXXXXXXXXXXXXXXXXXXX" "WARSAW 00-916 POLAND" "XXXXXXXXXXXXXXXXXXXX" "012 - 345678 - 910 -" "XXXXXXXXXXXXXXXXXXXX" "012-345678-910" "XXXXXXXXXXXXXX" \ No newline at end of file diff --git a/src/anonymizer/README.md b/src/anonymizer/README.md new file mode 100644 index 0000000..fa4fef5 --- /dev/null +++ b/src/anonymizer/README.md @@ -0,0 +1,93 @@ +# etradeAnonymizer + +Minimal Rust tool for: +- Detecting personally identifiable information (PII) tokens in tightly structured PDF FlateDecode streams. +- Emitting a shell-friendly replace command line. +- Applying replacement strings while preserving original stream size (padding when needed). + +## Usage + +Detect mode (prints a replacement command suggestion): +``` +cargo run --bin etradeAnonymizer -- detect statement.pdf +``` + +Replace mode (apply explicit replacements): +``` +cargo run --bin etradeAnonymizer -- replace input.pdf output.pdf "JAN KOWALSKI" "XXXXX XXXXXXXX" +``` + +You can chain multiple pairs: +``` +cargo run --bin etradeAnonymizer -- replace in.pdf out.pdf "A" "X" "B" "Y" +``` + +## Build & Test +``` +cargo build --release --bin etradeAnonymizer +cargo test --bin etradeAnonymizer +``` + +Resulting binary: `target/release/etradeAnonymizer`. + +## Design Notes +- Strict PDF header (`%PDF-1.3\n`) enforcement; unsupported PDFs are skipped gracefully. This is for simplicity. +- Only FlateDecode streams with explicit `/Length` are processed as described below. +- Replacement recompresses; if no level fits original size, original compressed stream is kept. + +### Why Padding? (Architecture Note) +This tool avoids full PDF parsing and rebuilding. Instead, it modifies streams **in-place**. +- PDF files rely on a Cross-Reference (XREF) table that stores the byte offset of every object. +- If we changed the length of a stream object, all subsequent object offsets would shift, invalidating the XREF table. 
+- To avoid rebuilding the XREF table (which requires full PDF structure understanding), we ensure the modified stream is **exactly the same length** as the original. +- We achieve this by recompressing the modified text. If the new compressed data is smaller, we **pad** the remainder with null bytes (`0x00`). +- If the new compressed data is larger than the original (even at best compression), we cannot safely replace it without corrupting the file, so we fall back to keeping the original stream (and warn the user). + +### Exact PDF object pattern searched +The tool searches for PDF objects that exactly match the following pattern (both human-readable and via regex): + +Human-readable pattern: + +``` + obj +<< +/Length +/Filter [/FlateDecode] +>> +stream + +endstream +endobj +``` + +Regex used in code (PCRE-style): + +``` +(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n +``` + +Only objects matching this pattern will be considered for detection and replacement for simplicity. + +## License +See `BSD-3-Clause` in `LICENSES/` directory. + +## Disclaimer + +Please note: this tool attempts to detect and replace common personally identifiable +information (PII) tokens in tightly structured PDF streams, but there is no guarantee +that all PII will be detected or removed. You must manually review the resulting +file and verify that sensitive information has been removed before sharing or +publishing. The maintainers make reasonable efforts to identify the following categories: + + - First & last name + - Mailing address (two lines) + - Account number + +These are the only PII categories we explicitly target. 
+ +We provide example screenshots showing the text tokens we look for and recommend +verifying manually: + +![Detected tokens — first page](../../../assets/first_page.png) + +![Detected tokens — third page](../../../assets/third_page.png) \ No newline at end of file diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs new file mode 100644 index 0000000..1a3c840 --- /dev/null +++ b/src/anonymizer/anonymizer.rs @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: 2024-2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +mod detect; +mod pdf; +mod replace; + +use std::env; + +/// Entry point for programmatic invocation and CLI help text. +fn help_text() -> &'static str { + "etradeAnonymizer - Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams.\n\ + \nUsage:\n\ + etradeAnonymizer detect \n\ + etradeAnonymizer replace [ ...]\n\ + \nExamples:\n\ + etradeAnonymizer detect statement.pdf\n\ + etradeAnonymizer replace input.pdf output.pdf \"JAN KOWALSKI\" \"XXXXX XXXXXXXX\"" +} + +/// Parse arguments and dispatch to detect / replace logic. Returns Ok even +/// for usage errors (prints help) to keep CLI simple. 
+pub fn run(args: Vec) -> Result<(), Box> { + if args.len() < 2 { + println!("{}", help_text()); + return Ok(()); + } + match args[1].as_str() { + "detect" => { + if args.len() != 3 { + println!("{}", help_text()); + return Ok(()); + } + detect::detect_pii(&args[2]) + } + "replace" => { + if args.len() < 6 || (args.len() - 4) % 2 != 0 { + println!("{}", help_text()); + return Ok(()); + } + let input_path = &args[2]; + let output_path = &args[3]; + let mut replacements: Vec<(String, String)> = Vec::new(); + let mut i = 4; + while i < args.len() - 1 { + replacements.push((args[i].clone(), args[i + 1].clone())); + i += 2; + } + replace::replace_mode(input_path, output_path, replacements) + } + _ => { + println!("{}", help_text()); + Ok(()) + } + } +} + +fn main() -> Result<(), Box> { + // Ensure users see warnings and errors by default even when RUST_LOG is not set. + // If RUST_LOG is provided, simple_logger will respect it; otherwise we default to `warn`. + if env::var("RUST_LOG").is_err() { + env::set_var("RUST_LOG", "warn"); + } + simple_logger::SimpleLogger::new().env().init().unwrap(); + + let args: Vec = env::args().collect(); + run(args) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + // Helper to mock args + fn mock_args(args: &[&str]) -> Vec { + let mut v = vec!["etradeAnonymizer".to_string()]; + for a in args { + v.push(a.to_string()); + } + v + } + + // Note: These tests require 'anonymizer_data' directory to be present in the working directory + // when running 'cargo test'. + + #[test] + fn test_detect_mode() -> Result<(), Box> { + // This test captures stdout, which is tricky in Rust test harness without external crate. + // However, we can verify it runs without error. 
+ + let sample = "anonymizer_data/sample_statement.pdf"; + if !std::path::Path::new(sample).exists() { + println!("Skipping test_detect_mode: {} not found", sample); + return Ok(()); + } + + let args = mock_args(&["detect", sample]); + run(args)?; + Ok(()) + } + + #[test] + fn test_replace_mode() -> Result<(), Box> { + let sample = "anonymizer_data/sample_statement.pdf"; + let expected_pdf = "anonymizer_data/expected_statement.pdf"; + let output_dir = "target/test_outputs"; + let output_pdf = "target/test_outputs/out_sample_statement.pdf"; + + if !std::path::Path::new(sample).exists() || !std::path::Path::new(expected_pdf).exists() { + println!("Skipping test_replace_mode: test data not found"); + return Ok(()); + } + + fs::create_dir_all(output_dir)?; + + // Arguments derived from expected_detect_output.txt content logic in original test + let args = mock_args(&[ + "replace", + sample, + output_pdf, + "JAN KOWALSKI", + "XXXXXXXXXXXX", + "UL. SWIETOKRZYSKA 12", + "XXXXXXXXXXXXXXXXXXXX", + "WARSAW 00-916 POLAND", + "XXXXXXXXXXXXXXXXXXXX", + "012 - 345678 - 910 -", + "XXXXXXXXXXXXXXXXXXXX", + "012-345678-910", + "XXXXXXXXXXXXXX", + ]); + + run(args)?; + + let produced = fs::read(output_pdf)?; + let expected = fs::read(expected_pdf)?; + assert_eq!(produced, expected, "produced PDF differs from expected"); + + // Cleanup + let _ = fs::remove_file(output_pdf); + Ok(()) + } +} diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs new file mode 100644 index 0000000..e24150d --- /dev/null +++ b/src/anonymizer/detect.rs @@ -0,0 +1,385 @@ +use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; +use log::{debug, info, warn}; + +pub(crate) struct DetectionConfig { + pub anchor_ms_account: &'static str, + pub anchor_for_period: &'static str, + pub anchor_for_name: &'static str, +} + +impl Default for DetectionConfig { + fn default() -> Self { + Self { + anchor_ms_account: "Morgan Stanley at Work Self-Directed Account", + anchor_for_period: "For the 
Period", + anchor_for_name: "FOR:", + } + } +} + +#[derive(Default, Debug)] +pub(crate) struct DetectionResult { + name: Option, + address_line1: Option, + address_line2: Option, + account_spaced: Option, + account_ms: Option, +} + +impl DetectionResult { + fn all_found(&self) -> bool { + self.name.is_some() + && self.address_line1.is_some() + && self.address_line2.is_some() + && self.account_spaced.is_some() + && self.account_ms.is_some() + } +} + +/// Detect PII tokens in `input_path` and print a replacement command line. +/// +/// The function inspects FlateDecode streams, extracts text tokens and heuristically +/// determines name/address/account tokens. It prints a single `replace` command +/// suitable for shell use. +pub fn detect_pii(input_path: &str) -> Result<(), Box> { + let pdf_data = match read_pdf(input_path) { + Ok(d) => d, + Err(_) => { + // Header validation already logged inside read_pdf + return Ok(()); + } + }; + + // let obj_re = Regex::new(OBJ_STREAM_RE).unwrap(); // Removed old regex initialization + let mut result = DetectionResult::default(); + let config = DetectionConfig::default(); + + for stream in stream_scanner(&pdf_data) { + if !stream.valid_end_marker { + warn!( + "Skipping stream due to end-marker mismatch for object at {}", + stream.object_start + ); + continue; + } + match extract_texts_from_stream(stream.compressed) { + Ok(extracted) => { + analyze_extracted_texts(&extracted, &mut result, &config); + if result.all_found() { + debug!("All target PII categories found; stopping search early."); + break; + } + } + Err(e) => { + warn!( + "Failed to extract texts from stream at {}: {}", + stream.object_start, e + ); + } + } + } + + let in_path = std::path::Path::new(input_path); + let parent = in_path + .parent() + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or_else(|| String::from("")); + let file_name = in_path + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| input_path.to_string()); + let 
out_path = if parent.is_empty() { + format!("anonymous_{}", file_name) + } else { + format!("{}/anonymous_{}", parent, file_name) + }; + + // Build final ordered list: name, addr1, addr2, account_spaced, account_ms + let mut final_texts: Vec = Vec::new(); + let mut inserted = std::collections::HashSet::new(); + if let Some(n) = result.name.as_ref() { + if inserted.insert(n.clone()) { + final_texts.push(n.clone()); + } + } + if let Some(a1) = result.address_line1.as_ref() { + if inserted.insert(a1.clone()) { + final_texts.push(a1.clone()); + } + } + if let Some(a2) = result.address_line2.as_ref() { + if inserted.insert(a2.clone()) { + final_texts.push(a2.clone()); + } + } + if let Some(sp) = result.account_spaced.as_ref() { + if inserted.insert(sp.clone()) { + final_texts.push(sp.clone()); + } + } + if let Some(ms) = result.account_ms.as_ref() { + if inserted.insert(ms.clone()) { + final_texts.push(ms.clone()); + } + } + + print!("replace \"{}\" \"{}\"", input_path, out_path); + for txt in &final_texts { + let replacement = "X".repeat(txt.len()); + print!(" \"{}\" \"{}\"", txt, replacement); + } + println!(); + + Ok(()) +} + +pub(crate) fn analyze_extracted_texts( + extracted_texts: &[String], + result: &mut DetectionResult, + config: &DetectionConfig, +) { + debug!("Analyzing {} extracted tokens", extracted_texts.len()); + for (i, txt) in extracted_texts.iter().enumerate() { + debug!(" [{}] {}", i, txt); + } + // Run the composed helpers (implemented as top-level private helpers) + if find_account_after_anchor_in_stream(extracted_texts, result, config) { + return; + } + let for_search_start = find_spaced_account_and_start(extracted_texts, result, config); + handle_for_and_extract(extracted_texts, for_search_start, result, config); + validate_account_match(result); +} + +// helper: if address lines already known, look for the anchor in this stream and pick following token +fn find_account_after_anchor_in_stream( + extracted_texts: &[String], + result: &mut 
DetectionResult, + config: &DetectionConfig, +) -> bool { + if result.address_line1.is_some() + && result.address_line2.is_some() + && result.account_ms.is_none() + { + let anchor_text = config.anchor_ms_account; + for (idx, t) in extracted_texts.iter().enumerate() { + if t.contains(anchor_text) { + let mut next = idx + 1; + while next < extracted_texts.len() { + let cand_full = &extracted_texts[next]; + if !cand_full.is_empty() { + info!( + "Found account number after anchor (later stream): {}", + cand_full + ); + result.account_ms = Some(cand_full.clone()); + return true; + } + next += 1; + } + } + } + } + false +} + +// look for spaced account after "For the Period" and return start index for FOR: scanning +fn find_spaced_account_and_start( + extracted_texts: &[String], + result: &mut DetectionResult, + config: &DetectionConfig, +) -> usize { + let mut for_search_start: usize = 0; + for (i, txt) in extracted_texts.iter().enumerate() { + if txt.contains(config.anchor_for_period) && i + 3 < extracted_texts.len() { + let account_full = extracted_texts[i + 3].clone(); + let account = account_full.as_str(); + if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { + info!( + "Found account number (with spaces) after 'For the Period': {}", + account + ); + if result.account_spaced.is_none() { + result.account_spaced = Some(account_full.clone()); + } + // start FOR: search after the account token + for_search_start = i + 4; // i+3 is account token, so start after + break; + } + } + } + for_search_start +} + +// handle FOR: marker - extract name and next two non-empty tokens as address lines; attempt anchor-based ms account after +fn handle_for_and_extract( + extracted_texts: &[String], + start: usize, + result: &mut DetectionResult, + config: &DetectionConfig, +) { + for (i, txt) in extracted_texts.iter().enumerate().skip(start) { + if txt.contains(config.anchor_for_name) && i + 1 < extracted_texts.len() { + let name_full = extracted_texts[i + 
1].clone(); + let name = name_full.as_str(); + if !name.is_empty() { + let mut ctx: Vec = Vec::new(); + for j in 0..4 { + if i + 1 + j < extracted_texts.len() { + ctx.push(extracted_texts[i + 1 + j].clone()); + } + } + info!( + "Found name after 'FOR:': {} -- context: {:?}", + name_full, ctx + ); + if result.name.is_none() { + result.name = Some(name_full.clone()); + } + } + + // Deterministic rule: unconditionally capture the next two non-empty tokens after the name. + // Prefer a later occurrence of the same name (some PDFs repeat the name and the address appears after the second occurrence). + let mut anchor_index = i + 1; // default: position of the name after FOR: + for k in (i + 2)..extracted_texts.len() { + if extracted_texts[k].contains(&name_full) { + anchor_index = k; + break; + } + } + + let mut collected = 0; + let mut look = 1; // start looking after the anchor name + while collected < 2 && anchor_index + look < extracted_texts.len() { + let candidate_full = extracted_texts[anchor_index + look].clone(); + let candidate = candidate_full.as_str(); + look += 1; + if candidate.is_empty() { + continue; + } + + // Always capture the next two non-empty tokens as address lines. 
+ collected += 1; + if collected == 1 { + info!( + "Captured address_line1 after name (anchor_index={}): {} -- token_index={}", + anchor_index, + candidate, + anchor_index + look - 1 + ); + if result.address_line1.is_none() { + result.address_line1 = Some(candidate_full.clone()); + } + } else { + info!( + "Captured address_line2 after name (anchor_index={}): {} -- token_index={}", + anchor_index, + candidate, + anchor_index + look - 1 + ); + if result.address_line2.is_none() { + result.address_line2 = Some(candidate_full.clone()); + } + } + } + + // Immediately after capturing the two address lines, pick the first non-empty token + // that follows anchor + if result.address_line1.is_some() && result.address_line2.is_some() { + // First: look for the specific preceding anchor and take the next token. + let mut found_via_anchor = false; + let anchor_text = config.anchor_ms_account; + let mut anchor_idx = None; + for idx in (anchor_index + look)..extracted_texts.len() { + if extracted_texts[idx].contains(anchor_text) { + anchor_idx = Some(idx); + break; + } + } + + if let Some(ai) = anchor_idx { + let mut next = ai + 1; + while next < extracted_texts.len() { + let cand_full = extracted_texts[next].clone(); + if !cand_full.is_empty() { + info!( + "Found account number after anchor '{}' : {}", + anchor_text, cand_full + ); + result.account_ms = Some(cand_full.clone()); + found_via_anchor = true; + break; + } + next += 1; + } + } + + if found_via_anchor { + return; // found via anchor, we're done + } + } + } + } +} + +// Validate account spaced vs non-spaced (compare digits-only) +fn validate_account_match(result: &DetectionResult) { + if let (Some(spaced), Some(ms)) = (result.account_spaced.as_ref(), result.account_ms.as_ref()) { + let digits_only = |s: &str| s.chars().filter(|c| c.is_numeric()).collect::(); + let ds = digits_only(spaced); + let dm = digits_only(ms); + if ds == dm { + info!( + "Validated account: spaced='{}' matches non-spaced='{}'", + spaced, ms + ); + 
} else { + warn!( + "Account mismatch: spaced='{}' vs non-spaced='{}' (digits: {} != {})", + spaced, ms, ds, dm + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find_spaced_account_after_for_period() { + // Simulate a small token stream that might appear near the account header + let tokens = vec![ + "Account Summary".to_string(), + "For the Period September 1".to_string(), + "-".to_string(), + "30, 2025".to_string(), + "123 - 456789 - 012".to_string(), + ]; + let mut res = DetectionResult::default(); + let config = DetectionConfig::default(); + analyze_extracted_texts(&tokens, &mut res, &config); + assert_eq!(res.account_spaced, Some("123 - 456789 - 012".to_string())); + } + + #[test] + fn test_for_name_and_address_extraction_and_anchor_account() { + // Realistic token stream: FOR: name, address tokens, then account anchor and number + let tokens = vec![ + "FOR:".to_string(), + "John Doe".to_string(), + "123 Market St".to_string(), + "Cityville 12345".to_string(), + "Account Details".to_string(), + "Morgan Stanley at Work Self-Directed Account".to_string(), + "987654321".to_string(), + ]; + let mut res = DetectionResult::default(); + let config = DetectionConfig::default(); + analyze_extracted_texts(&tokens, &mut res, &config); + assert_eq!(res.name, Some("John Doe".to_string())); + assert_eq!(res.address_line1, Some("123 Market St".to_string())); + assert_eq!(res.address_line2, Some("Cityville 12345".to_string())); + assert_eq!(res.account_ms, Some("987654321".to_string())); + } +} diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs new file mode 100644 index 0000000..ef4fb96 --- /dev/null +++ b/src/anonymizer/pdf.rs @@ -0,0 +1,293 @@ +//! PDF parsing utilities: header validation, stream extraction, text token parsing. +//! This module is intentionally strict and only supports a narrow subset of PDF +//! objects used by the target documents: FlateDecode streams with explicit /Length. 
+use flate2::read::ZlibDecoder; +use flate2::write::ZlibEncoder; +use flate2::Compression; +use log::{debug, error, info, warn}; +use regex::bytes::Regex; +use std::fs::File; +use std::io::{Read, Write}; + +// Centralized constants and helpers for PDF parsing to reduce duplication between detect/replace. +/// Expected PDF header (strictly enforced). +pub(crate) const PDF_HEADER: &[u8] = b"%PDF-1.3\n"; +/// Regex matching an object with FlateDecode stream and explicit /Length. +pub(crate) const OBJ_STREAM_RE: &str = r"(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n"; + +/// Read entire PDF file and validate strict header. +pub(crate) fn read_pdf(path: &str) -> Result, Box> { + let mut file = File::open(path)?; + let mut pdf_data = Vec::new(); + file.read_to_end(&mut pdf_data)?; + if pdf_data.len() < PDF_HEADER.len() || &pdf_data[0..PDF_HEADER.len()] != PDF_HEADER { + error!( + "Unsupported PDF version or invalid PDF header at '{}'.", + path + ); + return Err("Invalid PDF header".into()); + } + Ok(pdf_data) +} + +// Lightweight representation of a PDF flate stream for detection-only workflow. +/// Lightweight representation of a FlateDecode stream slice inside a PDF. +pub(crate) struct StreamData<'a> { + pub object_start: usize, + pub data_start: usize, + pub compressed: &'a [u8], + pub valid_end_marker: bool, +} + +/// Iterator over stream objects, avoiding allocating a full Vec upfront. +pub(crate) struct StreamScanner<'a> { + re: Regex, + data: &'a [u8], + search_from: usize, +} + +/// Create a new streaming iterator over PDF FlateDecode objects. 
+pub(crate) fn stream_scanner<'a>(pdf_data: &'a [u8]) -> StreamScanner<'a> { + StreamScanner { + re: Regex::new(OBJ_STREAM_RE).unwrap(), + data: pdf_data, + search_from: 0, + } +} + +impl<'a> Iterator for StreamScanner<'a> { + type Item = StreamData<'a>; + fn next(&mut self) -> Option { + while self.search_from < self.data.len() { + // Use captures_at to find next match at current position + if let Some(caps) = self.re.captures_at(self.data, self.search_from) { + let whole = caps.get(0)?; + // advance search past this object to avoid infinite loop + self.search_from = whole.end(); + if let Some((compressed, data_start, valid)) = + extract_stream_bytes(self.data, &caps) + { + return Some(StreamData { + object_start: whole.start(), + data_start, + compressed, + valid_end_marker: valid, + }); + } else { + continue; // skip invalid capture + } + } else { + self.search_from = self.data.len(); + } + } + None + } +} + +/// Given a regex capture for an object, validate endmarker and return compressed stream bytes +/// Given a capture for a stream object, validate end marker and return the raw compressed data plus a validity flag. 
+pub(crate) fn extract_stream_bytes<'a>( + pdf_data: &'a [u8], + caps: ®ex::bytes::Captures<'a>, +) -> Option<(&'a [u8], usize, bool)> { + // Strict project rule: expected end marker is fixed here + const EXPECTED_END: &[u8] = b"\nendstream\nendobj"; + // Validate capture groups + let whole = match caps.get(0) { + Some(m) => m, + None => { + error!("PDF object capture missing whole-match"); + return None; + } + }; + let length_bytes = match caps.get(1) { + Some(m) => m.as_bytes(), + None => { + error!( + "PDF object capture missing /Length group at object starting {}", + whole.start() + ); + return None; + } + }; + + // Parse length strictly; if it fails, we consider this object invalid + let length = match std::str::from_utf8(length_bytes) + .ok() + .and_then(|s| s.parse::().ok()) + { + Some(v) => v, + None => { + error!( + "Invalid /Length value '{}' in object starting at {}", + String::from_utf8_lossy(length_bytes), + whole.start() + ); + return None; + } + }; + + let data_start = whole.end(); + let stream_end = match data_start.checked_add(length) { + Some(v) => v, + None => { + error!( + "Stream end overflow for object at {} (length={})", + data_start, length + ); + return None; + } + }; + + // strict bounds checks: must be entirely within pdf_data + if stream_end > pdf_data.len() { + error!( + "Stream end out of bounds for object starting at {}: stream_end={} pdf_len={}", + data_start, + stream_end, + pdf_data.len() + ); + return None; + } + if stream_end + EXPECTED_END.len() > pdf_data.len() { + error!( + "End marker out of bounds after stream_end {} for object starting at {} (pdf_len={})", + stream_end, + data_start, + pdf_data.len() + ); + return None; + } + + // Validate exact end marker (requirements are strict) + let debug_slice = &pdf_data[stream_end..stream_end + EXPECTED_END.len()]; + if debug_slice != EXPECTED_END { + warn!( + "End marker mismatch for object starting at {}: found {:?}, expected {:?}", + data_start, debug_slice, EXPECTED_END + ); + 
// Return decompressed candidate but indicate end marker mismatch for caller decision + return Some((&pdf_data[data_start..stream_end], data_start, false)); + } + + Some((&pdf_data[data_start..stream_end], data_start, true)) +} + +/// Decompress stream and extract text tokens from PDF text operators +/// Decompress a FlateDecode stream and extract text tokens appearing in `( .. ) Tj` operators. +pub(crate) fn extract_texts_from_stream( + compressed_data: &[u8], +) -> Result, Box> { + let mut decoder = ZlibDecoder::new(compressed_data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed)?; + let text_re = + Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; + let mut extracted_texts: Vec = Vec::new(); + for text_caps in text_re.captures_iter(&decompressed) { + if let Some(txt) = text_caps.get(1) { + extracted_texts.push(String::from_utf8_lossy(txt.as_bytes()).to_string()); + } + } + + Ok(extracted_texts) +} + +// === Stream replacement & recompression utilities (migrated from streams.rs) === + +/// Replace all non-overlapping occurrences of `search` with `replace` in `data`. +fn replace_bytes_all_occurrences(data: &[u8], search: &[u8], replace: &[u8]) -> (Vec, usize) { + let mut result = Vec::new(); + let mut pos = 0; + let mut count = 0; + while pos < data.len() { + if pos + search.len() <= data.len() && &data[pos..pos + search.len()] == search { + result.extend_from_slice(replace); + pos += search.len(); + count += 1; + } else { + result.push(data[pos]); + pos += 1; + } + } + (result, count) +} + +/// Try progressive zlib compression levels (0..=9) returning the first compressed form whose length is <= `max_size`. 
+fn find_fitting_compression(data: &[u8], max_size: usize) -> Option<(Vec, u32)> { + for level in 0..=9 { + let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(level)); + if encoder.write_all(data).is_err() { + continue; + } + let compressed = encoder.finish().ok()?; + if compressed.len() <= max_size { + return Some((compressed, level)); + } + } + None +} + +/// Decompress a stream, apply all replacements, and recompress if possible within +/// the original compressed size. Returns new compressed bytes and per-pattern counts. +pub(crate) fn process_stream( + compressed_data: &[u8], + replacements: &[(String, String)], +) -> Result<(Vec, std::collections::HashMap), Box> { + let original_len = compressed_data.len(); + let mut decoder = ZlibDecoder::new(compressed_data); + let mut decompressed = Vec::new(); + match decoder.read_to_end(&mut decompressed) { + Ok(_) => { + debug!("Decompressed: {} B", decompressed.len()); + let mut modified_data = decompressed.clone(); + let mut found_any = false; + let mut per_counts: std::collections::HashMap = + std::collections::HashMap::new(); + for (needle, repl) in replacements { + let (new_data, occurrences) = replace_bytes_all_occurrences( + &modified_data, + needle.as_bytes(), + repl.as_bytes(), + ); + if occurrences > 0 { + debug!("Found '{}' {} times", needle, occurrences); + modified_data = new_data; + per_counts.insert(needle.clone(), occurrences); + found_any = true; + } + } + if found_any { + if let Some((fitting, level)) = + find_fitting_compression(&modified_data, original_len) + { + debug!( + "Compression level {} produced {} B (<= {} B)", + level, + fitting.len(), + original_len + ); + info!( + "Compressed stream with level {} ({} B)", + level, + fitting.len() + ); + return Ok((fitting, per_counts)); + } else { + warn!( + "All compression levels exceed original size {}; keeping original. 
PII MAY REMAIN EXPOSED!", + original_len + ); + info!( + "Falling back to original compressed stream ({} B)", + original_len + ); + } + } + } + Err(e) => { + error!("Decompression error: {}", e); + } + } + Ok((compressed_data.to_vec(), std::collections::HashMap::new())) +} diff --git a/src/anonymizer/replace.rs b/src/anonymizer/replace.rs new file mode 100644 index 0000000..86e0b8c --- /dev/null +++ b/src/anonymizer/replace.rs @@ -0,0 +1,111 @@ +use super::pdf::{process_stream, read_pdf, stream_scanner}; +use log::{debug, info, warn}; +use std::fs::File; +use std::io::Write; + +/// Replace occurrences of given `replacements` inside FlateDecode streams of `input_path` and +/// write modified PDF to `output_path`. +/// +/// Each replacement is a `(original, replacement)` pair. Compression is retried to fit +/// the original stream size; if impossible, the original compressed stream is preserved. +pub(crate) fn replace_mode( + input_path: &str, + output_path: &str, + replacements: Vec<(String, String)>, +) -> Result<(), Box> { + info!("Loading: {}", input_path); + + let pdf_data = match read_pdf(input_path) { + Ok(d) => d, + Err(_) => return Ok(()), // header or open error already logged + }; + + debug!("PDF Size: {} bytes", pdf_data.len()); + + let mut output_data = pdf_data.clone(); + let mut streams_modified = 0; + let mut streams_total = 0; + // Aggregate counts per replacement for concise summary + let mut replacement_counts: std::collections::HashMap = + std::collections::HashMap::new(); + + for stream in stream_scanner(&pdf_data) { + if !stream.valid_end_marker { + warn!( + "Skipping stream due to end-marker mismatch for object at {}", + stream.object_start + ); + continue; + } + streams_total += 1; + + let compressed_data = stream.compressed; + let data_start = stream.data_start; + let stream_end = data_start + compressed_data.len(); + + // decompress modify recompress to exact same size + debug!("═══ Stream #{} ═══", streams_total); + debug!( + "Position: 
{}-{} ({} B)", + data_start, + stream_end, + compressed_data.len() + ); + let (new_compressed_data, stream_replacement_counts) = + process_stream(compressed_data, &replacements)?; + // aggregate counts from this stream + let mut stream_total = 0usize; + if !stream_replacement_counts.is_empty() { + for (k, v) in stream_replacement_counts.iter() { + *replacement_counts.entry(k.clone()).or_insert(0) += *v; + stream_total += *v; + } + } + + if stream_total > 0 { + streams_modified += 1; + } + + // write into output data + for (idx, &byte) in new_compressed_data.iter().enumerate() { + output_data[data_start + idx] = byte; + } + debug!( + "Compression: {} → {} B", + compressed_data.len(), + new_compressed_data.len() + ); + + let padding_len = compressed_data.len() - new_compressed_data.len(); + for idx in new_compressed_data.len()..compressed_data.len() { + output_data[data_start + idx] = 0x00; + } + + if padding_len > 0 { + info!( + "Applied padding of {} bytes to stream at {}", + padding_len, data_start + ); + } + } + + info!("Saving: {}", output_path); + File::create(output_path)?.write_all(&output_data)?; + + info!("DONE!"); + info!( + "Streams: total={} modified={}", + streams_total, streams_modified + ); + let replacements_total: usize = replacement_counts.values().sum(); + info!("Replacements total: {}", replacements_total); + if !replacement_counts.is_empty() { + info!("Breakdown:"); + for (k, v) in replacement_counts.iter() { + info!(" '{}' -> {}", k, v); + } + } + info!("File: {}", output_path); + + Ok(()) +} From d863116900ddfc58095356c85e6ddf50c7caca93 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sat, 29 Nov 2025 22:49:42 +0100 Subject: [PATCH 02/21] also detect #id --- src/anonymizer/detect.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index e24150d..936ff2d 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -19,6 +19,7 @@ impl Default 
for DetectionConfig { #[derive(Default, Debug)] pub(crate) struct DetectionResult { + id: Option, name: Option, address_line1: Option, address_line2: Option, @@ -28,7 +29,8 @@ pub(crate) struct DetectionResult { impl DetectionResult { fn all_found(&self) -> bool { - self.name.is_some() + self.id.is_some() + && self.name.is_some() && self.address_line1.is_some() && self.address_line2.is_some() && self.account_spaced.is_some() @@ -97,6 +99,11 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { // Build final ordered list: name, addr1, addr2, account_spaced, account_ms let mut final_texts: Vec = Vec::new(); let mut inserted = std::collections::HashSet::new(); + if let Some(id) = result.id.as_ref() { + if inserted.insert(id.clone()) { + final_texts.push(id.clone()); + } + } if let Some(n) = result.name.as_ref() { if inserted.insert(n.clone()) { final_texts.push(n.clone()); @@ -248,6 +255,15 @@ fn handle_for_and_extract( } } + // If we found a later occurrence, check for ID immediately before it. 
+ if anchor_index > i + 1 { + let id_candidate = &extracted_texts[anchor_index - 1]; + if !id_candidate.is_empty() { + info!("Found ID before name anchor: {}", id_candidate); + result.id = Some(id_candidate.clone()); + } + } + let mut collected = 0; let mut look = 1; // start looking after the anchor name while collected < 2 && anchor_index + look < extracted_texts.len() { From d24aa575e065827971b64ca6a43b05e5193970d5 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sun, 7 Dec 2025 15:03:20 +0100 Subject: [PATCH 03/21] add list mode add use std::error::Error return Err instead of Ok after read_pdf --- src/anonymizer/anonymizer.rs | 22 ++++++++++++++++------ src/anonymizer/detect.rs | 8 +------- src/anonymizer/list.rs | 33 +++++++++++++++++++++++++++++++++ src/anonymizer/pdf.rs | 9 +++++---- 4 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 src/anonymizer/list.rs diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index 1a3c840..760419e 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -1,16 +1,19 @@ // SPDX-FileCopyrightText: 2024-2025 RustInFinance // SPDX-License-Identifier: BSD-3-Clause +mod list; mod detect; mod pdf; mod replace; use std::env; +use std::error::Error; /// Entry point for programmatic invocation and CLI help text. fn help_text() -> &'static str { "etradeAnonymizer - Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams.\n\ \nUsage:\n\ + etradeAnonymizer list \n\ etradeAnonymizer detect \n\ etradeAnonymizer replace [ ...]\n\ \nExamples:\n\ @@ -20,23 +23,30 @@ fn help_text() -> &'static str { /// Parse arguments and dispatch to detect / replace logic. Returns Ok even /// for usage errors (prints help) to keep CLI simple. 
-pub fn run(args: Vec) -> Result<(), Box> { +pub fn run(args: Vec) -> Result<(), Box> { if args.len() < 2 { println!("{}", help_text()); return Ok(()); } match args[1].as_str() { + "list" => { + if args.len() != 3 { + println!("{}", help_text()); + return Err("Invalid use of list".into()); + } + list::list_texts(&args[2]) + } "detect" => { if args.len() != 3 { println!("{}", help_text()); - return Ok(()); + return Err("Invalid use of detect".into()); } detect::detect_pii(&args[2]) } "replace" => { if args.len() < 6 || (args.len() - 4) % 2 != 0 { println!("{}", help_text()); - return Ok(()); + return Err("Invalid use of replace".into()); } let input_path = &args[2]; let output_path = &args[3]; @@ -55,7 +65,7 @@ pub fn run(args: Vec) -> Result<(), Box> { } } -fn main() -> Result<(), Box> { +fn main() -> Result<(), Box> { // Ensure users see warnings and errors by default even when RUST_LOG is not set. // If RUST_LOG is provided, simple_logger will respect it; otherwise we default to `warn`. if env::var("RUST_LOG").is_err() { @@ -85,7 +95,7 @@ mod tests { // when running 'cargo test'. #[test] - fn test_detect_mode() -> Result<(), Box> { + fn test_detect_mode() -> Result<(), Box> { // This test captures stdout, which is tricky in Rust test harness without external crate. // However, we can verify it runs without error. @@ -101,7 +111,7 @@ mod tests { } #[test] - fn test_replace_mode() -> Result<(), Box> { + fn test_replace_mode() -> Result<(), Box> { let sample = "anonymizer_data/sample_statement.pdf"; let expected_pdf = "anonymizer_data/expected_statement.pdf"; let output_dir = "target/test_outputs"; diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 936ff2d..4fea513 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -44,13 +44,7 @@ impl DetectionResult { /// determines name/address/account tokens. It prints a single `replace` command /// suitable for shell use. 
pub fn detect_pii(input_path: &str) -> Result<(), Box> { - let pdf_data = match read_pdf(input_path) { - Ok(d) => d, - Err(_) => { - // Header validation already logged inside read_pdf - return Ok(()); - } - }; + let pdf_data = read_pdf(input_path)?; // let obj_re = Regex::new(OBJ_STREAM_RE).unwrap(); // Removed old regex initialization let mut result = DetectionResult::default(); diff --git a/src/anonymizer/list.rs b/src/anonymizer/list.rs new file mode 100644 index 0000000..a807266 --- /dev/null +++ b/src/anonymizer/list.rs @@ -0,0 +1,33 @@ +use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; +use log::{info,warn}; + +pub fn list_texts(input_path : &str) -> Result<(), Box> { + let pdf_data = read_pdf(input_path)?; + + let mut global_text_id = 0; + for (stream_id, stream) in stream_scanner(&pdf_data).enumerate() { + if !stream.valid_end_marker { + warn!( + "Skipping stream due to end-marker mismatch for object at {}", + stream.object_start + ); + continue; + } + match extract_texts_from_stream(stream.compressed) { + Ok(extracted_texts) => { + info!("stream {} has {} extracted tokens", stream_id, extracted_texts.len()); + for txt in extracted_texts.iter() { + println!(" [{}] {}", global_text_id, txt); + global_text_id+=1; + } + } + Err(e) => { + warn!( + "Failed to extract texts from stream at {}: {}", + stream.object_start, e + ); + } + } + } + Ok(()) +} diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs index ef4fb96..aefa77e 100644 --- a/src/anonymizer/pdf.rs +++ b/src/anonymizer/pdf.rs @@ -8,6 +8,7 @@ use log::{debug, error, info, warn}; use regex::bytes::Regex; use std::fs::File; use std::io::{Read, Write}; +use std::error::Error; // Centralized constants and helpers for PDF parsing to reduce duplication between detect/replace. /// Expected PDF header (strictly enforced). 
@@ -16,7 +17,7 @@ pub(crate) const PDF_HEADER: &[u8] = b"%PDF-1.3\n"; pub(crate) const OBJ_STREAM_RE: &str = r"(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n"; /// Read entire PDF file and validate strict header. -pub(crate) fn read_pdf(path: &str) -> Result, Box> { +pub(crate) fn read_pdf(path: &str) -> Result, Box> { let mut file = File::open(path)?; let mut pdf_data = Vec::new(); file.read_to_end(&mut pdf_data)?; @@ -177,12 +178,12 @@ pub(crate) fn extract_stream_bytes<'a>( /// Decompress a FlateDecode stream and extract text tokens appearing in `( .. ) Tj` operators. pub(crate) fn extract_texts_from_stream( compressed_data: &[u8], -) -> Result, Box> { +) -> Result, Box> { let mut decoder = ZlibDecoder::new(compressed_data); let mut decompressed = Vec::new(); decoder.read_to_end(&mut decompressed)?; let text_re = - Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; + Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; let mut extracted_texts: Vec = Vec::new(); for text_caps in text_re.captures_iter(&decompressed) { if let Some(txt) = text_caps.get(1) { @@ -233,7 +234,7 @@ fn find_fitting_compression(data: &[u8], max_size: usize) -> Option<(Vec, u3 pub(crate) fn process_stream( compressed_data: &[u8], replacements: &[(String, String)], -) -> Result<(Vec, std::collections::HashMap), Box> { +) -> Result<(Vec, std::collections::HashMap), Box> { let original_len = compressed_data.len(); let mut decoder = ZlibDecoder::new(compressed_data); let mut decompressed = Vec::new(); From df1e0ba6a55fe23a47933f82be2ae05d8c353c94 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Dec 2025 22:22:55 +0100 Subject: [PATCH 04/21] omit forgotten std::error in detect.rs --- src/anonymizer/detect.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 4fea513..760af5a 100644 --- a/src/anonymizer/detect.rs +++ 
b/src/anonymizer/detect.rs @@ -1,5 +1,6 @@ use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; +use std::error::Error; pub(crate) struct DetectionConfig { pub anchor_ms_account: &'static str, @@ -43,7 +44,7 @@ impl DetectionResult { /// The function inspects FlateDecode streams, extracts text tokens and heuristically /// determines name/address/account tokens. It prints a single `replace` command /// suitable for shell use. -pub fn detect_pii(input_path: &str) -> Result<(), Box> { +pub fn detect_pii(input_path: &str) -> Result<(), Box> { let pdf_data = read_pdf(input_path)?; // let obj_re = Regex::new(OBJ_STREAM_RE).unwrap(); // Removed old regex initialization From 63bcdee8f4f3803026957509560f3ed544f69082 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Dec 2025 22:19:17 +0100 Subject: [PATCH 05/21] extract code to fn find_putput_path --- src/anonymizer/detect.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 760af5a..e4c684f 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -76,20 +76,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { } } - let in_path = std::path::Path::new(input_path); - let parent = in_path - .parent() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or_else(|| String::from("")); - let file_name = in_path - .file_name() - .map(|s| s.to_string_lossy().into_owned()) - .unwrap_or_else(|| input_path.to_string()); - let out_path = if parent.is_empty() { - format!("anonymous_{}", file_name) - } else { - format!("{}/anonymous_{}", parent, file_name) - }; + let out_path = find_output_path(input_path); // Build final ordered list: name, addr1, addr2, account_spaced, account_ms let mut final_texts: Vec = Vec::new(); @@ -135,6 +122,23 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { Ok(()) } +pub find_output_path(in_path: &str) { + 
let in_path = std::path::Path::new(input_path); + let parent = in_path + .parent() + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or_else(|| String::from("")); + let file_name = in_path + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| input_path.to_string()); + let out_path = if parent.is_empty() { + format!("anonymous_{}", file_name) + } else { + format!("{}/anonymous_{}", parent, file_name) + }; +} + pub(crate) fn analyze_extracted_texts( extracted_texts: &[String], result: &mut DetectionResult, From 4664a760a0d78694d4378971bb52627e39d04f50 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Dec 2025 22:49:01 +0100 Subject: [PATCH 06/21] extract path modification function to anonymizer/path.rs --- src/anonymizer/anonymizer.rs | 1 + src/anonymizer/detect.rs | 21 +++------------- src/anonymizer/path.rs | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 18 deletions(-) create mode 100644 src/anonymizer/path.rs diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index 760419e..f7c68c3 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -3,6 +3,7 @@ mod list; mod detect; +mod path; mod pdf; mod replace; diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index e4c684f..9ff4c05 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -76,7 +76,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { } } - let out_path = find_output_path(input_path); + let out_path = super::path::anonymous_output_path(input_path); // Build final ordered list: name, addr1, addr2, account_spaced, account_ms let mut final_texts: Vec = Vec::new(); @@ -112,7 +112,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { } } - print!("replace \"{}\" \"{}\"", input_path, out_path); + print!("replace \"{}\" \"{}\"", input_path, out_path.display()); for txt in &final_texts { let replacement = "X".repeat(txt.len()); print!(" \"{}\" 
\"{}\"", txt, replacement); @@ -122,22 +122,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { Ok(()) } -pub find_output_path(in_path: &str) { - let in_path = std::path::Path::new(input_path); - let parent = in_path - .parent() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or_else(|| String::from("")); - let file_name = in_path - .file_name() - .map(|s| s.to_string_lossy().into_owned()) - .unwrap_or_else(|| input_path.to_string()); - let out_path = if parent.is_empty() { - format!("anonymous_{}", file_name) - } else { - format!("{}/anonymous_{}", parent, file_name) - }; -} + pub(crate) fn analyze_extracted_texts( extracted_texts: &[String], diff --git a/src/anonymizer/path.rs b/src/anonymizer/path.rs new file mode 100644 index 0000000..63ba66e --- /dev/null +++ b/src/anonymizer/path.rs @@ -0,0 +1,47 @@ +use std::path::PathBuf; + +/// Build an output path by prefixing the input filename with `anonymous_`. +/// +/// Preserves the parent directory if present and returns a `PathBuf`. 
+pub(crate) fn anonymous_output_path>(in_path: P) -> PathBuf { + let input_path = in_path.as_ref(); + + let file_name = input_path + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| input_path.to_string_lossy().into_owned()); + + if let Some(parent) = input_path.parent() { + let mut pb = PathBuf::from(parent); + pb.push(format!("anonymous_{}", file_name)); + pb + } else { + PathBuf::from(format!("anonymous_{}", file_name)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_anonymous_output_path_no_parent() { + let in_path = "statement.pdf"; + let out = anonymous_output_path(in_path); + assert_eq!(out, std::path::PathBuf::from("anonymous_statement.pdf")); + } + + #[test] + fn test_anonymous_output_path_with_parent() { + let in_path = "some/dir/statement.pdf"; + let out = anonymous_output_path(in_path); + assert_eq!(out, std::path::PathBuf::from("some/dir/anonymous_statement.pdf")); + } + + #[test] + fn test_anonymous_output_path_unicode_filename() { + let in_path = "résumé.pdf"; + let out = anonymous_output_path(in_path); + assert_eq!(out, std::path::PathBuf::from("anonymous_résumé.pdf")); + } +} From 33611d88918e7f92ead7e18952ee1b6e43e8e616 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Dec 2025 22:53:31 +0100 Subject: [PATCH 07/21] use path ref broadly --- src/anonymizer/detect.rs | 9 +++++---- src/anonymizer/pdf.rs | 7 ++++--- src/anonymizer/replace.rs | 18 ++++++++++-------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 9ff4c05..4a94a8d 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -44,8 +44,9 @@ impl DetectionResult { /// The function inspects FlateDecode streams, extracts text tokens and heuristically /// determines name/address/account tokens. It prints a single `replace` command /// suitable for shell use. 
-pub fn detect_pii(input_path: &str) -> Result<(), Box> { - let pdf_data = read_pdf(input_path)?; +pub fn detect_pii>(input_path: P) -> Result<(), Box> { + let input_path_ref = input_path.as_ref(); + let pdf_data = read_pdf(input_path_ref)?; // let obj_re = Regex::new(OBJ_STREAM_RE).unwrap(); // Removed old regex initialization let mut result = DetectionResult::default(); @@ -76,7 +77,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { } } - let out_path = super::path::anonymous_output_path(input_path); + let out_path = super::path::anonymous_output_path(input_path_ref); // Build final ordered list: name, addr1, addr2, account_spaced, account_ms let mut final_texts: Vec = Vec::new(); @@ -112,7 +113,7 @@ pub fn detect_pii(input_path: &str) -> Result<(), Box> { } } - print!("replace \"{}\" \"{}\"", input_path, out_path.display()); + print!("replace \"{}\" \"{}\"", input_path_ref.display(), out_path.display()); for txt in &final_texts { let replacement = "X".repeat(txt.len()); print!(" \"{}\" \"{}\"", txt, replacement); diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs index aefa77e..adc6b36 100644 --- a/src/anonymizer/pdf.rs +++ b/src/anonymizer/pdf.rs @@ -17,14 +17,15 @@ pub(crate) const PDF_HEADER: &[u8] = b"%PDF-1.3\n"; pub(crate) const OBJ_STREAM_RE: &str = r"(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n"; /// Read entire PDF file and validate strict header. 
-pub(crate) fn read_pdf(path: &str) -> Result, Box> { - let mut file = File::open(path)?; +pub(crate) fn read_pdf>(path: P) -> Result, Box> { + let path_ref = path.as_ref(); + let mut file = File::open(path_ref)?; let mut pdf_data = Vec::new(); file.read_to_end(&mut pdf_data)?; if pdf_data.len() < PDF_HEADER.len() || &pdf_data[0..PDF_HEADER.len()] != PDF_HEADER { error!( "Unsupported PDF version or invalid PDF header at '{}'.", - path + path_ref.display() ); return Err("Invalid PDF header".into()); } diff --git a/src/anonymizer/replace.rs b/src/anonymizer/replace.rs index 86e0b8c..00aaa52 100644 --- a/src/anonymizer/replace.rs +++ b/src/anonymizer/replace.rs @@ -8,14 +8,16 @@ use std::io::Write; /// /// Each replacement is a `(original, replacement)` pair. Compression is retried to fit /// the original stream size; if impossible, the original compressed stream is preserved. -pub(crate) fn replace_mode( - input_path: &str, - output_path: &str, +pub(crate) fn replace_mode>( + input_path: P, + output_path: P, replacements: Vec<(String, String)>, ) -> Result<(), Box> { - info!("Loading: {}", input_path); + let input_path_ref = input_path.as_ref(); + let output_path_ref = output_path.as_ref(); + info!("Loading: {}", input_path_ref.display()); - let pdf_data = match read_pdf(input_path) { + let pdf_data = match read_pdf(input_path_ref) { Ok(d) => d, Err(_) => return Ok(()), // header or open error already logged }; @@ -89,8 +91,8 @@ pub(crate) fn replace_mode( } } - info!("Saving: {}", output_path); - File::create(output_path)?.write_all(&output_data)?; + info!("Saving: {}", output_path_ref.display()); + File::create(output_path_ref)?.write_all(&output_data)?; info!("DONE!"); info!( @@ -105,7 +107,7 @@ pub(crate) fn replace_mode( info!(" '{}' -> {}", k, v); } } - info!("File: {}", output_path); + info!("File: {}", output_path_ref.display()); Ok(()) } From 7514747a2a629e32605f49977d3082618cd0c46e Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 10 Dec 2025 
23:04:00 +0100 Subject: [PATCH 08/21] switch to clap --- Cargo.toml | 2 +- src/anonymizer/anonymizer.rs | 159 ++++++++++++++--------------------- 2 files changed, 66 insertions(+), 95 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3d42011..b1979e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ serde = { version = "1.0.104", features = ["derive"] } roxmltree = "0.20.0" simple_logger = "4.0.0" log = "0.4.0" -clap = "~2.27.0" +clap = { version = "4.5.51", features = ["derive"] } regex = "1.3.3" calamine = "0.22.1" wild = "2.2.0" diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index f7c68c3..c3757b6 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -7,63 +7,41 @@ mod path; mod pdf; mod replace; +use clap::{Parser, Subcommand}; use std::env; use std::error::Error; -/// Entry point for programmatic invocation and CLI help text. -fn help_text() -> &'static str { - "etradeAnonymizer - Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams.\n\ - \nUsage:\n\ - etradeAnonymizer list \n\ - etradeAnonymizer detect \n\ - etradeAnonymizer replace [ ...]\n\ - \nExamples:\n\ - etradeAnonymizer detect statement.pdf\n\ - etradeAnonymizer replace input.pdf output.pdf \"JAN KOWALSKI\" \"XXXXX XXXXXXXX\"" +/// Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams +#[derive(Parser)] +#[command(name = "etradeAnonymizer")] +#[command(version, about, long_about = None)] +struct Cli { + #[command(subcommand)] + command: Commands, } -/// Parse arguments and dispatch to detect / replace logic. Returns Ok even -/// for usage errors (prints help) to keep CLI simple. 
-pub fn run(args: Vec) -> Result<(), Box> { - if args.len() < 2 { - println!("{}", help_text()); - return Ok(()); - } - match args[1].as_str() { - "list" => { - if args.len() != 3 { - println!("{}", help_text()); - return Err("Invalid use of list".into()); - } - list::list_texts(&args[2]) - } - "detect" => { - if args.len() != 3 { - println!("{}", help_text()); - return Err("Invalid use of detect".into()); - } - detect::detect_pii(&args[2]) - } - "replace" => { - if args.len() < 6 || (args.len() - 4) % 2 != 0 { - println!("{}", help_text()); - return Err("Invalid use of replace".into()); - } - let input_path = &args[2]; - let output_path = &args[3]; - let mut replacements: Vec<(String, String)> = Vec::new(); - let mut i = 4; - while i < args.len() - 1 { - replacements.push((args[i].clone(), args[i + 1].clone())); - i += 2; - } - replace::replace_mode(input_path, output_path, replacements) - } - _ => { - println!("{}", help_text()); - Ok(()) - } - } +#[derive(Subcommand)] +enum Commands { + /// List all text tokens from FlateDecode streams in the PDF + List { + /// Path to the input PDF file + input_file: String, + }, + /// Detect PII (name, address, account) in the PDF and print replacement command + Detect { + /// Path to the input PDF file + input_file: String, + }, + /// Replace strings in PDF FlateDecode streams and save to output file + Replace { + /// Path to the input PDF file + input_file: String, + /// Path to the output PDF file + output_file: String, + /// Pairs of strings to replace: ... 
+ #[arg(required = true, num_args = 2..)] + replacements: Vec, + }, } fn main() -> Result<(), Box> { @@ -74,8 +52,28 @@ fn main() -> Result<(), Box> { } simple_logger::SimpleLogger::new().env().init().unwrap(); - let args: Vec = env::args().collect(); - run(args) + let cli = Cli::parse(); + + match cli.command { + Commands::List { input_file } => list::list_texts(&input_file), + Commands::Detect { input_file } => detect::detect_pii(&input_file), + Commands::Replace { + input_file, + output_file, + replacements, + } => { + if replacements.len() % 2 != 0 { + return Err("Replacements must be provided as pairs: ".into()); + } + let mut replacement_pairs: Vec<(String, String)> = Vec::new(); + let mut i = 0; + while i < replacements.len() { + replacement_pairs.push((replacements[i].clone(), replacements[i + 1].clone())); + i += 2; + } + replace::replace_mode(&input_file, &output_file, replacement_pairs) + } + } } #[cfg(test)] @@ -83,31 +81,15 @@ mod tests { use super::*; use std::fs; - // Helper to mock args - fn mock_args(args: &[&str]) -> Vec { - let mut v = vec!["etradeAnonymizer".to_string()]; - for a in args { - v.push(a.to_string()); - } - v - } - // Note: These tests require 'anonymizer_data' directory to be present in the working directory // when running 'cargo test'. #[test] fn test_detect_mode() -> Result<(), Box> { - // This test captures stdout, which is tricky in Rust test harness without external crate. - // However, we can verify it runs without error. 
- let sample = "anonymizer_data/sample_statement.pdf"; - if !std::path::Path::new(sample).exists() { - println!("Skipping test_detect_mode: {} not found", sample); - return Ok(()); - } + assert!(std::path::Path::new(sample).exists(), "Required test file missing: {}", sample); - let args = mock_args(&["detect", sample]); - run(args)?; + detect::detect_pii(sample)?; Ok(()) } @@ -118,31 +100,20 @@ mod tests { let output_dir = "target/test_outputs"; let output_pdf = "target/test_outputs/out_sample_statement.pdf"; - if !std::path::Path::new(sample).exists() || !std::path::Path::new(expected_pdf).exists() { - println!("Skipping test_replace_mode: test data not found"); - return Ok(()); - } + assert!(std::path::Path::new(sample).exists(), "Required test file missing: {}", sample); + assert!(std::path::Path::new(expected_pdf).exists(), "Required test file missing: {}", expected_pdf); fs::create_dir_all(output_dir)?; - // Arguments derived from expected_detect_output.txt content logic in original test - let args = mock_args(&[ - "replace", - sample, - output_pdf, - "JAN KOWALSKI", - "XXXXXXXXXXXX", - "UL. SWIETOKRZYSKA 12", - "XXXXXXXXXXXXXXXXXXXX", - "WARSAW 00-916 POLAND", - "XXXXXXXXXXXXXXXXXXXX", - "012 - 345678 - 910 -", - "XXXXXXXXXXXXXXXXXXXX", - "012-345678-910", - "XXXXXXXXXXXXXX", - ]); - - run(args)?; + let replacements = vec![ + ("JAN KOWALSKI".to_string(), "XXXXXXXXXXXX".to_string()), + ("UL. 
SWIETOKRZYSKA 12".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), + ("WARSAW 00-916 POLAND".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), + ("012 - 345678 - 910 -".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), + ("012-345678-910".to_string(), "XXXXXXXXXXXXXX".to_string()), + ]; + + replace::replace_mode(sample, output_pdf, replacements)?; let produced = fs::read(output_pdf)?; let expected = fs::read(expected_pdf)?; From 4606fc3c2237c4810180e79709ed6e2cfbc647b5 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 21:26:18 +0100 Subject: [PATCH 09/21] use Path and PathBuf and minor improvements --- src/anonymizer/anonymizer.rs | 39 ++++++++++++++++++------------------ src/anonymizer/detect.rs | 26 +++++++++++------------- src/anonymizer/list.rs | 6 ++++-- src/anonymizer/path.rs | 10 ++++----- src/anonymizer/pdf.rs | 7 +++---- src/anonymizer/replace.rs | 23 ++++++++++----------- 6 files changed, 53 insertions(+), 58 deletions(-) diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index c3757b6..d79076a 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -10,6 +10,7 @@ mod replace; use clap::{Parser, Subcommand}; use std::env; use std::error::Error; +use std::path::PathBuf; /// Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams #[derive(Parser)] @@ -25,19 +26,19 @@ enum Commands { /// List all text tokens from FlateDecode streams in the PDF List { /// Path to the input PDF file - input_file: String, + input_file: PathBuf, }, /// Detect PII (name, address, account) in the PDF and print replacement command Detect { /// Path to the input PDF file - input_file: String, + input_file: PathBuf, }, /// Replace strings in PDF FlateDecode streams and save to output file Replace { /// Path to the input PDF file - input_file: String, + input_file: PathBuf, /// Path to the output PDF file - output_file: String, + output_file: PathBuf, /// Pairs of strings to replace: ... 
#[arg(required = true, num_args = 2..)] replacements: Vec, @@ -65,13 +66,11 @@ fn main() -> Result<(), Box> { if replacements.len() % 2 != 0 { return Err("Replacements must be provided as pairs: ".into()); } - let mut replacement_pairs: Vec<(String, String)> = Vec::new(); - let mut i = 0; - while i < replacements.len() { - replacement_pairs.push((replacements[i].clone(), replacements[i + 1].clone())); - i += 2; - } - replace::replace_mode(&input_file, &output_file, replacement_pairs) + let replacement_pairs: Vec<(String, String)> = replacements + .chunks(2) + .map(|chunk| (chunk[0].clone(), chunk[1].clone())) + .collect(); + replace::replace_pii(&input_file, &output_file, &replacement_pairs) } } } @@ -86,22 +85,22 @@ mod tests { #[test] fn test_detect_mode() -> Result<(), Box> { - let sample = "anonymizer_data/sample_statement.pdf"; - assert!(std::path::Path::new(sample).exists(), "Required test file missing: {}", sample); + let sample = std::path::Path::new("anonymizer_data/sample_statement.pdf"); + assert!(sample.exists(), "Required test file missing: {}", sample.display()); detect::detect_pii(sample)?; Ok(()) } #[test] - fn test_replace_mode() -> Result<(), Box> { - let sample = "anonymizer_data/sample_statement.pdf"; - let expected_pdf = "anonymizer_data/expected_statement.pdf"; + fn test_replace_pii() -> Result<(), Box> { + let sample = std::path::Path::new("anonymizer_data/sample_statement.pdf"); + let expected_pdf = std::path::Path::new("anonymizer_data/expected_statement.pdf"); let output_dir = "target/test_outputs"; - let output_pdf = "target/test_outputs/out_sample_statement.pdf"; + let output_pdf = std::path::Path::new("target/test_outputs/out_sample_statement.pdf"); - assert!(std::path::Path::new(sample).exists(), "Required test file missing: {}", sample); - assert!(std::path::Path::new(expected_pdf).exists(), "Required test file missing: {}", expected_pdf); + assert!(sample.exists(), "Required test file missing: {}", sample.display()); + 
assert!(expected_pdf.exists(), "Required test file missing: {}", expected_pdf.display()); fs::create_dir_all(output_dir)?; @@ -113,7 +112,7 @@ mod tests { ("012-345678-910".to_string(), "XXXXXXXXXXXXXX".to_string()), ]; - replace::replace_mode(sample, output_pdf, replacements)?; + replace::replace_pii(sample, output_pdf, &replacements)?; let produced = fs::read(output_pdf)?; let expected = fs::read(expected_pdf)?; diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 4a94a8d..6050792 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -44,14 +44,12 @@ impl DetectionResult { /// The function inspects FlateDecode streams, extracts text tokens and heuristically /// determines name/address/account tokens. It prints a single `replace` command /// suitable for shell use. -pub fn detect_pii>(input_path: P) -> Result<(), Box> { - let input_path_ref = input_path.as_ref(); - let pdf_data = read_pdf(input_path_ref)?; +pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { + let pdf_data = read_pdf(input_path)?; - // let obj_re = Regex::new(OBJ_STREAM_RE).unwrap(); // Removed old regex initialization let mut result = DetectionResult::default(); let config = DetectionConfig::default(); - + for stream in stream_scanner(&pdf_data) { if !stream.valid_end_marker { warn!( @@ -77,43 +75,43 @@ pub fn detect_pii>(input_path: P) -> Result<(), Box = Vec::new(); let mut inserted = std::collections::HashSet::new(); - if let Some(id) = result.id.as_ref() { + if let Some(id) = &result.id { if inserted.insert(id.clone()) { final_texts.push(id.clone()); } } - if let Some(n) = result.name.as_ref() { + if let Some(n) = &result.name { if inserted.insert(n.clone()) { final_texts.push(n.clone()); } } - if let Some(a1) = result.address_line1.as_ref() { + if let Some(a1) = &result.address_line1 { if inserted.insert(a1.clone()) { final_texts.push(a1.clone()); } } - if let Some(a2) = result.address_line2.as_ref() { + if let Some(a2) = 
&result.address_line2 { if inserted.insert(a2.clone()) { final_texts.push(a2.clone()); } } - if let Some(sp) = result.account_spaced.as_ref() { + if let Some(sp) = &result.account_spaced { if inserted.insert(sp.clone()) { final_texts.push(sp.clone()); } } - if let Some(ms) = result.account_ms.as_ref() { + if let Some(ms) = &result.account_ms { if inserted.insert(ms.clone()) { final_texts.push(ms.clone()); } } - print!("replace \"{}\" \"{}\"", input_path_ref.display(), out_path.display()); + print!("replace \"{}\" \"{}\"", input_path.display(), out_path.display()); for txt in &final_texts { let replacement = "X".repeat(txt.len()); print!(" \"{}\" \"{}\"", txt, replacement); @@ -325,7 +323,7 @@ fn handle_for_and_extract( // Validate account spaced vs non-spaced (compare digits-only) fn validate_account_match(result: &DetectionResult) { - if let (Some(spaced), Some(ms)) = (result.account_spaced.as_ref(), result.account_ms.as_ref()) { + if let (Some(spaced), Some(ms)) = (&result.account_spaced, &result.account_ms) { let digits_only = |s: &str| s.chars().filter(|c| c.is_numeric()).collect::(); let ds = digits_only(spaced); let dm = digits_only(ms); diff --git a/src/anonymizer/list.rs b/src/anonymizer/list.rs index a807266..7634f52 100644 --- a/src/anonymizer/list.rs +++ b/src/anonymizer/list.rs @@ -1,7 +1,9 @@ use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; -use log::{info,warn}; +use log::{info, warn}; +use std::error::Error; +use std::path::Path; -pub fn list_texts(input_path : &str) -> Result<(), Box> { +pub fn list_texts(input_path: &Path) -> Result<(), Box> { let pdf_data = read_pdf(input_path)?; let mut global_text_id = 0; diff --git a/src/anonymizer/path.rs b/src/anonymizer/path.rs index 63ba66e..4309f9f 100644 --- a/src/anonymizer/path.rs +++ b/src/anonymizer/path.rs @@ -3,9 +3,7 @@ use std::path::PathBuf; /// Build an output path by prefixing the input filename with `anonymous_`. 
/// /// Preserves the parent directory if present and returns a `PathBuf`. -pub(crate) fn anonymous_output_path>(in_path: P) -> PathBuf { - let input_path = in_path.as_ref(); - +pub(crate) fn anonymous_output_path(input_path: &std::path::Path) -> PathBuf { let file_name = input_path .file_name() .map(|s| s.to_string_lossy().into_owned()) @@ -26,21 +24,21 @@ mod tests { #[test] fn test_anonymous_output_path_no_parent() { - let in_path = "statement.pdf"; + let in_path = std::path::Path::new("statement.pdf"); let out = anonymous_output_path(in_path); assert_eq!(out, std::path::PathBuf::from("anonymous_statement.pdf")); } #[test] fn test_anonymous_output_path_with_parent() { - let in_path = "some/dir/statement.pdf"; + let in_path = std::path::Path::new("some/dir/statement.pdf"); let out = anonymous_output_path(in_path); assert_eq!(out, std::path::PathBuf::from("some/dir/anonymous_statement.pdf")); } #[test] fn test_anonymous_output_path_unicode_filename() { - let in_path = "résumé.pdf"; + let in_path = std::path::Path::new("résumé.pdf"); let out = anonymous_output_path(in_path); assert_eq!(out, std::path::PathBuf::from("anonymous_résumé.pdf")); } diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs index adc6b36..7fe5da5 100644 --- a/src/anonymizer/pdf.rs +++ b/src/anonymizer/pdf.rs @@ -17,15 +17,14 @@ pub(crate) const PDF_HEADER: &[u8] = b"%PDF-1.3\n"; pub(crate) const OBJ_STREAM_RE: &str = r"(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n"; /// Read entire PDF file and validate strict header. 
-pub(crate) fn read_pdf>(path: P) -> Result, Box> { - let path_ref = path.as_ref(); - let mut file = File::open(path_ref)?; +pub(crate) fn read_pdf(path: &std::path::Path) -> Result, Box> { + let mut file = File::open(path)?; let mut pdf_data = Vec::new(); file.read_to_end(&mut pdf_data)?; if pdf_data.len() < PDF_HEADER.len() || &pdf_data[0..PDF_HEADER.len()] != PDF_HEADER { error!( "Unsupported PDF version or invalid PDF header at '{}'.", - path_ref.display() + path.display() ); return Err("Invalid PDF header".into()); } diff --git a/src/anonymizer/replace.rs b/src/anonymizer/replace.rs index 00aaa52..51d777e 100644 --- a/src/anonymizer/replace.rs +++ b/src/anonymizer/replace.rs @@ -2,22 +2,21 @@ use super::pdf::{process_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; use std::fs::File; use std::io::Write; +use std::path::Path; /// Replace occurrences of given `replacements` inside FlateDecode streams of `input_path` and /// write modified PDF to `output_path`. /// /// Each replacement is a `(original, replacement)` pair. Compression is retried to fit /// the original stream size; if impossible, the original compressed stream is preserved. 
-pub(crate) fn replace_mode>( - input_path: P, - output_path: P, - replacements: Vec<(String, String)>, +pub(crate) fn replace_pii( + input_path: &Path, + output_path: &Path, + replacements: &[(String, String)], ) -> Result<(), Box> { - let input_path_ref = input_path.as_ref(); - let output_path_ref = output_path.as_ref(); - info!("Loading: {}", input_path_ref.display()); + info!("Loading: {}", input_path.display()); - let pdf_data = match read_pdf(input_path_ref) { + let pdf_data = match read_pdf(input_path) { Ok(d) => d, Err(_) => return Ok(()), // header or open error already logged }; @@ -54,7 +53,7 @@ pub(crate) fn replace_mode>( compressed_data.len() ); let (new_compressed_data, stream_replacement_counts) = - process_stream(compressed_data, &replacements)?; + process_stream(compressed_data, replacements)?; // aggregate counts from this stream let mut stream_total = 0usize; if !stream_replacement_counts.is_empty() { @@ -91,8 +90,8 @@ pub(crate) fn replace_mode>( } } - info!("Saving: {}", output_path_ref.display()); - File::create(output_path_ref)?.write_all(&output_data)?; + info!("Saving: {}", output_path.display()); + File::create(output_path)?.write_all(&output_data)?; info!("DONE!"); info!( @@ -107,7 +106,7 @@ pub(crate) fn replace_mode>( info!(" '{}' -> {}", k, v); } } - info!("File: {}", output_path_ref.display()); + info!("File: {}", output_path.display()); Ok(()) } From 6aa3dba5b7ae7dc370ea6670a4a545d6ded6fdad Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 21:40:21 +0100 Subject: [PATCH 10/21] add anchor for recipient data --- src/anonymizer/detect.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 6050792..d36b177 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -6,18 +6,22 @@ pub(crate) struct DetectionConfig { pub anchor_ms_account: &'static str, pub anchor_for_period: &'static str, pub anchor_for_name: &'static str, 
+ pub anchor_for_recipient_data: &'static str, } +// Find the first `to_be_redacted = anchor + offset`. Replace all `to_be_redacted` impl Default for DetectionConfig { fn default() -> Self { Self { - anchor_ms_account: "Morgan Stanley at Work Self-Directed Account", - anchor_for_period: "For the Period", - anchor_for_name: "FOR:", + anchor_ms_account: "Morgan Stanley at Work Self-Directed Account", // +1 + anchor_for_period: "For the Period", // +3 + anchor_for_name: "FOR:", // +1 + anchor_for_recipient_data: "E*TRADE is a business of Morgan Stanley.", // +1, +2, +3, +4 } } } + #[derive(Default, Debug)] pub(crate) struct DetectionResult { id: Option, From c194f7b8d14deddbecc6c031f88939ce1eb846a9 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 21:43:46 +0100 Subject: [PATCH 11/21] rename anchors --- src/anonymizer/detect.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index d36b177..fcf31c6 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -3,8 +3,8 @@ use log::{debug, info, warn}; use std::error::Error; pub(crate) struct DetectionConfig { - pub anchor_ms_account: &'static str, - pub anchor_for_period: &'static str, + pub anchor_for_account: &'static str, + pub anchor_for_account_spaced: &'static str, pub anchor_for_name: &'static str, pub anchor_for_recipient_data: &'static str, } @@ -13,8 +13,8 @@ pub(crate) struct DetectionConfig { impl Default for DetectionConfig { fn default() -> Self { Self { - anchor_ms_account: "Morgan Stanley at Work Self-Directed Account", // +1 - anchor_for_period: "For the Period", // +3 + anchor_for_account: "Morgan Stanley at Work Self-Directed Account", // +1 + anchor_for_account_spaced: "For the Period", // +3 anchor_for_name: "FOR:", // +1 anchor_for_recipient_data: "E*TRADE is a business of Morgan Stanley.", // +1, +2, +3, +4 } @@ -155,7 +155,7 @@ fn find_account_after_anchor_in_stream( && 
result.address_line2.is_some() && result.account_ms.is_none() { - let anchor_text = config.anchor_ms_account; + let anchor_text = config.anchor_for_account; for (idx, t) in extracted_texts.iter().enumerate() { if t.contains(anchor_text) { let mut next = idx + 1; @@ -185,7 +185,7 @@ fn find_spaced_account_and_start( ) -> usize { let mut for_search_start: usize = 0; for (i, txt) in extracted_texts.iter().enumerate() { - if txt.contains(config.anchor_for_period) && i + 3 < extracted_texts.len() { + if txt.contains(config.anchor_for_account_spaced) && i + 3 < extracted_texts.len() { let account_full = extracted_texts[i + 3].clone(); let account = account_full.as_str(); if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { @@ -291,7 +291,7 @@ fn handle_for_and_extract( if result.address_line1.is_some() && result.address_line2.is_some() { // First: look for the specific preceding anchor and take the next token. let mut found_via_anchor = false; - let anchor_text = config.anchor_ms_account; + let anchor_text = config.anchor_for_account; let mut anchor_idx = None; for idx in (anchor_index + look)..extracted_texts.len() { if extracted_texts[idx].contains(anchor_text) { From bd9e5bb4de9e11aa6456d25f8d9e667270c3b9cc Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 22:00:53 +0100 Subject: [PATCH 12/21] nicely format anchors --- src/anonymizer/detect.rs | 63 ++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index fcf31c6..f02eeae 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -2,26 +2,51 @@ use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; use std::error::Error; +struct AnchorOffsets { + text: &'static str, + offsets: &'static [usize], +} + pub(crate) struct DetectionConfig { - pub anchor_for_account: &'static str, - pub anchor_for_account_spaced: &'static 
str, - pub anchor_for_name: &'static str, - pub anchor_for_recipient_data: &'static str, + pub account: AnchorOffsets, + pub account_spaced: AnchorOffsets, + pub name: AnchorOffsets, + pub recipient_data: AnchorOffsets, } -// Find the first `to_be_redacted = anchor + offset`. Replace all `to_be_redacted` +// Find the first `to_be_redacted = anchor + offset`. Replace all `to_be_redacted` you can find impl Default for DetectionConfig { fn default() -> Self { Self { - anchor_for_account: "Morgan Stanley at Work Self-Directed Account", // +1 - anchor_for_account_spaced: "For the Period", // +3 - anchor_for_name: "FOR:", // +1 - anchor_for_recipient_data: "E*TRADE is a business of Morgan Stanley.", // +1, +2, +3, +4 + // [148] 012-345678-910 + account: AnchorOffsets { + text: "Morgan Stanley at Work Self-Directed Account", + offsets: &[1], + }, + // [10] 012 - 345678 - 910 - + account_spaced: AnchorOffsets { + text: "For the Period", + offsets: &[3], + }, + // [14] JAN KOWALSKI + name: AnchorOffsets { + text: "FOR:", + offsets: &[1], + }, + /* + [18] #BWNJGWM + [19] JAN KOWALSKI + [20] UL. 
SWIETOKRZYSKA 12 + [21] WARSAW 00-916 POLAND + */ + recipient_data: AnchorOffsets { + text: "E*TRADE is a business of Morgan Stanley.", + offsets: &[1, 2, 3, 4], + }, } } } - #[derive(Default, Debug)] pub(crate) struct DetectionResult { id: Option, @@ -53,7 +78,7 @@ pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { let mut result = DetectionResult::default(); let config = DetectionConfig::default(); - + for stream in stream_scanner(&pdf_data) { if !stream.valid_end_marker { warn!( @@ -115,7 +140,11 @@ pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { } } - print!("replace \"{}\" \"{}\"", input_path.display(), out_path.display()); + print!( + "replace \"{}\" \"{}\"", + input_path.display(), + out_path.display() + ); for txt in &final_texts { let replacement = "X".repeat(txt.len()); print!(" \"{}\" \"{}\"", txt, replacement); @@ -125,8 +154,6 @@ pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { Ok(()) } - - pub(crate) fn analyze_extracted_texts( extracted_texts: &[String], result: &mut DetectionResult, @@ -155,7 +182,7 @@ fn find_account_after_anchor_in_stream( && result.address_line2.is_some() && result.account_ms.is_none() { - let anchor_text = config.anchor_for_account; + let anchor_text = config.account.text; for (idx, t) in extracted_texts.iter().enumerate() { if t.contains(anchor_text) { let mut next = idx + 1; @@ -185,7 +212,7 @@ fn find_spaced_account_and_start( ) -> usize { let mut for_search_start: usize = 0; for (i, txt) in extracted_texts.iter().enumerate() { - if txt.contains(config.anchor_for_account_spaced) && i + 3 < extracted_texts.len() { + if txt.contains(config.account_spaced.text) && i + 3 < extracted_texts.len() { let account_full = extracted_texts[i + 3].clone(); let account = account_full.as_str(); if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { @@ -213,7 +240,7 @@ fn handle_for_and_extract( config: &DetectionConfig, ) { for (i, txt) in 
extracted_texts.iter().enumerate().skip(start) { - if txt.contains(config.anchor_for_name) && i + 1 < extracted_texts.len() { + if txt.contains(config.name.text) && i + 1 < extracted_texts.len() { let name_full = extracted_texts[i + 1].clone(); let name = name_full.as_str(); if !name.is_empty() { @@ -291,7 +318,7 @@ fn handle_for_and_extract( if result.address_line1.is_some() && result.address_line2.is_some() { // First: look for the specific preceding anchor and take the next token. let mut found_via_anchor = false; - let anchor_text = config.anchor_for_account; + let anchor_text = config.account.text; let mut anchor_idx = None; for idx in (anchor_index + look)..extracted_texts.len() { if extracted_texts[idx].contains(anchor_text) { From 090829c4afbd2269a54585f9df9c82083f746ec7 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 23:07:08 +0100 Subject: [PATCH 13/21] Use just AnchorOffset struct --- src/anonymizer/detect.rs | 115 ++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index f02eeae..259918b 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -2,47 +2,40 @@ use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; use std::error::Error; -struct AnchorOffsets { - text: &'static str, - offsets: &'static [usize], +pub(crate) struct AnchorOffset { + pub text: &'static str, + pub offset: usize, } pub(crate) struct DetectionConfig { - pub account: AnchorOffsets, - pub account_spaced: AnchorOffsets, - pub name: AnchorOffsets, - pub recipient_data: AnchorOffsets, + pub account: AnchorOffset, + pub account_spaced: AnchorOffset, + pub name: AnchorOffset, + pub recipient_code: AnchorOffset, + pub recipient_address_line1: AnchorOffset, + pub recipient_address_line2: AnchorOffset, } -// Find the first `to_be_redacted = anchor + offset`. 
Replace all `to_be_redacted` you can find +// Find the first `to_be_redacted = anchor + offset`. Replace all `to_be_redacted` you can find. For most sensitive data. impl Default for DetectionConfig { fn default() -> Self { Self { // [148] 012-345678-910 - account: AnchorOffsets { - text: "Morgan Stanley at Work Self-Directed Account", - offsets: &[1], - }, + account: AnchorOffset { text: "Morgan Stanley at Work Self-Directed Account", offset: 1 }, // [10] 012 - 345678 - 910 - - account_spaced: AnchorOffsets { - text: "For the Period", - offsets: &[3], - }, + account_spaced: AnchorOffset { text: "For the Period", offset: 3 }, // [14] JAN KOWALSKI - name: AnchorOffsets { - text: "FOR:", - offsets: &[1], - }, + name: AnchorOffset { text: "FOR:", offset: 1 }, /* - [18] #BWNJGWM + [18] #ABCDEFG [19] JAN KOWALSKI [20] UL. SWIETOKRZYSKA 12 [21] WARSAW 00-916 POLAND */ - recipient_data: AnchorOffsets { - text: "E*TRADE is a business of Morgan Stanley.", - offsets: &[1, 2, 3, 4], - }, + // recipient tokens follow the same anchor; offsets are 1, 3, 4 + recipient_code: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 1 }, + recipient_address_line1: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 3 }, + recipient_address_line2: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 4 }, } } } @@ -185,18 +178,19 @@ fn find_account_after_anchor_in_stream( let anchor_text = config.account.text; for (idx, t) in extracted_texts.iter().enumerate() { if t.contains(anchor_text) { - let mut next = idx + 1; - while next < extracted_texts.len() { - let cand_full = &extracted_texts[next]; - if !cand_full.is_empty() { + // use the configured offset for account token + let off = config.account.offset; + let account_idx = idx + off; + if account_idx < extracted_texts.len() { + let account_candidate = &extracted_texts[account_idx]; + if !account_candidate.is_empty() { info!( - "Found account number after anchor (later stream): 
{}", - cand_full + "Found account number after anchor at offset {}: {}", + off, account_candidate ); - result.account_ms = Some(cand_full.clone()); + result.account_ms = Some(account_candidate.clone()); return true; } - next += 1; } } } @@ -212,20 +206,22 @@ fn find_spaced_account_and_start( ) -> usize { let mut for_search_start: usize = 0; for (i, txt) in extracted_texts.iter().enumerate() { - if txt.contains(config.account_spaced.text) && i + 3 < extracted_texts.len() { - let account_full = extracted_texts[i + 3].clone(); - let account = account_full.as_str(); - if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { - info!( - "Found account number (with spaces) after 'For the Period': {}", - account - ); - if result.account_spaced.is_none() { + if txt.contains(config.account_spaced.text) { + // use the configured offset for spaced account token + let offset = config.account_spaced.offset; + if i + offset < extracted_texts.len() { + let account_full = extracted_texts[i + offset].clone(); + let account = account_full.as_str(); + if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { + info!( + "Found account number (with spaces) after 'For the Period': {}", + account + ); result.account_spaced = Some(account_full.clone()); + // start FOR: search after the account token (offset + 1) + for_search_start = i + offset + 1; + break; } - // start FOR: search after the account token - for_search_start = i + 4; // i+3 is account token, so start after - break; } } } @@ -240,8 +236,13 @@ fn handle_for_and_extract( config: &DetectionConfig, ) { for (i, txt) in extracted_texts.iter().enumerate().skip(start) { - if txt.contains(config.name.text) && i + 1 < extracted_texts.len() { - let name_full = extracted_texts[i + 1].clone(); + if txt.contains(config.name.text) { + // name offset: where the actual name token is relative to the FOR: anchor + let name_offset = config.name.offset; + if i + name_offset >= extracted_texts.len() { + continue; + } 
+ let name_full = extracted_texts[i + name_offset].clone(); let name = name_full.as_str(); if !name.is_empty() { let mut ctx: Vec = Vec::new(); @@ -261,7 +262,7 @@ fn handle_for_and_extract( // Deterministic rule: unconditionally capture the next two non-empty tokens after the name. // Prefer a later occurrence of the same name (some PDFs repeat the name and the address appears after the second occurrence). - let mut anchor_index = i + 1; // default: position of the name after FOR: + let mut anchor_index = i + name_offset; // default: position of the name after FOR: for k in (i + 2)..extracted_texts.len() { if extracted_texts[k].contains(&name_full) { anchor_index = k; @@ -328,19 +329,19 @@ fn handle_for_and_extract( } if let Some(ai) = anchor_idx { - let mut next = ai + 1; - while next < extracted_texts.len() { - let cand_full = extracted_texts[next].clone(); - if !cand_full.is_empty() { + // use configured offset relative to found anchor + let off = config.account.offset; + let account_idx = ai + off; + if account_idx < extracted_texts.len() { + let account_candidate = extracted_texts[account_idx].clone(); + if !account_candidate.is_empty() { info!( - "Found account number after anchor '{}' : {}", - anchor_text, cand_full + "Found account number after anchor '{}' at offset {}: {}", + anchor_text, off, account_candidate ); - result.account_ms = Some(cand_full.clone()); + result.account_ms = Some(account_candidate.clone()); found_via_anchor = true; - break; } - next += 1; } } From ed5e43e1425c59059d8ccde2f0a32bdf456c4da0 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 23:07:35 +0100 Subject: [PATCH 14/21] Cargo.lock --- Cargo.lock | 182 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 121 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0a0813e..7b0c369 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -92,10 +92,54 @@ dependencies = [ ] [[package]] -name = "ansi_term" -version = "0.9.0" +name = 
"anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.0", +] [[package]] name = "argminmax" @@ -137,17 +181,6 @@ version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.5.0" @@ -175,12 +208,6 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" -[[package]] -name = "bitflags" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4efd02e230a02e18f92fc2735f44597385ed02ad8f831e7c1c1156ee5e1ab3a5" - [[package]] name = "bitflags" version = "1.3.2" @@ -313,19 +340,44 @@ dependencies = [ [[package]] name = "clap" -version = "2.27.1" +version = "4.5.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8c532887f1a292d17de05ae858a8fe50a301e196f9ef0ddb7ccd0d1d00f180" +checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" dependencies = [ - "ansi_term", - "atty", - "bitflags 0.9.1", + "anstream", + "anstyle", + "clap_lex", "strsim", - "textwrap", - "unicode-width 0.1.14", - "vec_map", ] +[[package]] +name = "clap_derive" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "clap_lex" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" + [[package]] name = "cmake" version = "0.1.54" @@ -344,6 +396,12 @@ dependencies = [ "encoding_rs", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "colored" version = "2.2.0" @@ -362,7 +420,7 @@ checksum = 
"b03b7db8e0b4b2fdad6c551e634134e99ec000e5c8c3b6856c65e8bbaded7a3b" dependencies = [ "crossterm", "unicode-segmentation", - "unicode-width 0.2.1", + "unicode-width", ] [[package]] @@ -602,6 +660,7 @@ dependencies = [ "chrono", "clap", "csv", + "flate2", "fltk", "holidays", "log", @@ -657,9 +716,9 @@ checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "miniz_oxide", @@ -861,13 +920,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] -name = "hermit-abi" -version = "0.1.19" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "holidays" @@ -1127,6 +1183,12 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.10.5" @@ -1279,6 +1341,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -1399,6 +1462,12 @@ version = "1.21.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "opaque-debug" version = "0.3.1" @@ -2193,6 +2262,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + [[package]] name = "simdutf8" version = "0.1.5" @@ -2321,9 +2396,9 @@ dependencies = [ [[package]] name = "strsim" -version = "0.6.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum_macros" @@ -2331,7 +2406,7 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -2431,15 +2506,6 @@ dependencies = [ "windows-sys 0.61.0", ] -[[package]] -name = "textwrap" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0b59b6b4b44d867f1370ef1bd91bfb262bf07bf0ae65c202ea2fbc16153b693" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -2633,12 +2699,6 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" -[[package]] -name = "unicode-width" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" - [[package]] name = "unicode-width" version = "0.2.1" @@ -2664,16 +2724,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] -name = "vcpkg" -version = "0.2.15" +name = "utf8parse" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] -name = "vec_map" -version = "0.8.2" +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "version_check" From d78ccafd79634eb8b10216f4a14ae0859750482d Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 23:07:57 +0100 Subject: [PATCH 15/21] add todo to readme. remove ref to screenshots --- src/anonymizer/README.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/anonymizer/README.md b/src/anonymizer/README.md index fa4fef5..aff0979 100644 --- a/src/anonymizer/README.md +++ b/src/anonymizer/README.md @@ -83,11 +83,14 @@ publishing. The maintainers make reasonable efforts to identify the following ca - Mailing address (two lines) - Account number -These are the only PII categories we explicitly target. +These are the only PII categories we explicitly target for now. 
-We provide example screenshots showing the text tokens we look for and recommend -verifying manually: +# Todo +## Detect +- Change the logic so that the search for each AnchorOffset happens from the beginning. This makes algorithm simple, allows for easy configurtion change. It's more maintainable. +- Add note that Detect step is only for most sensitive data? -![Detected tokens — first page](../../../assets/first_page.png) - -![Detected tokens — third page](../../../assets/third_page.png) \ No newline at end of file +## Replace +It's not valid to remove every occurrence of Beginning Total Value (the amount in dollars) from the pdf because the same value might appear in section used for calculation. +- Keep the replace logic for most sensitive data. +- Introduce some additional data format for removing financial data (less sensitive). From 816af4d1365fcd3b2f999b62f7afe52f5715f5e6 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 12 Dec 2025 23:14:33 +0100 Subject: [PATCH 16/21] bring back accidentally removed line from REUSE.toml --- REUSE.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/REUSE.toml b/REUSE.toml index 8c8c496..725c37d 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -21,6 +21,7 @@ path = [ "data/G&L_Expanded_polish.xlsx", "data/ecb_example_response.xml", "revolut_data/*.csv", + "revolut_data/*.tsv", "anonymizer_data/*", ] SPDX-FileCopyrightText = "2025 RustInFinance" From 295d856bb8a2a52d4526d9092637f7be46086228 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sat, 13 Dec 2025 20:21:27 +0100 Subject: [PATCH 17/21] simplifying detect logic Change the logic so that the search for each AnchorOffset happens from the beginning. This makes algorithm simple, allows for easy configuration change. It's more maintainable.
--- src/anonymizer/detect.rs | 152 ++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 66 deletions(-) diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 259918b..541fa34 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -21,11 +21,20 @@ impl Default for DetectionConfig { fn default() -> Self { Self { // [148] 012-345678-910 - account: AnchorOffset { text: "Morgan Stanley at Work Self-Directed Account", offset: 1 }, + account: AnchorOffset { + text: "Morgan Stanley at Work Self-Directed Account", + offset: 1, + }, // [10] 012 - 345678 - 910 - - account_spaced: AnchorOffset { text: "For the Period", offset: 3 }, + account_spaced: AnchorOffset { + text: "For the Period", + offset: 3, + }, // [14] JAN KOWALSKI - name: AnchorOffset { text: "FOR:", offset: 1 }, + name: AnchorOffset { + text: "FOR:", + offset: 1, + }, /* [18] #ABCDEFG [19] JAN KOWALSKI @@ -33,9 +42,18 @@ impl Default for DetectionConfig { [21] WARSAW 00-916 POLAND */ // recipient tokens follow the same anchor; offsets are 1, 3, 4 - recipient_code: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 1 }, - recipient_address_line1: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 3 }, - recipient_address_line2: AnchorOffset { text: "E*TRADE is a business of Morgan Stanley.", offset: 4 }, + recipient_code: AnchorOffset { + text: "E*TRADE is a business of Morgan Stanley.", + offset: 1, + }, + recipient_address_line1: AnchorOffset { + text: "E*TRADE is a business of Morgan Stanley.", + offset: 3, + }, + recipient_address_line2: AnchorOffset { + text: "E*TRADE is a business of Morgan Stanley.", + offset: 4, + }, } } } @@ -156,46 +174,44 @@ pub(crate) fn analyze_extracted_texts( for (i, txt) in extracted_texts.iter().enumerate() { debug!(" [{}] {}", i, txt); } - // Run the composed helpers (implemented as top-level private helpers) - if find_account_after_anchor_in_stream(extracted_texts, result, config) 
{ - return; + + // Per README TODO: search for each AnchorOffset from the beginning. + // 1) Find spaced account anywhere in the stream + let _start = find_spaced_account_and_start(extracted_texts, result, config); + + // 2) Run FOR: name/address extraction from the beginning + handle_for_and_extract(extracted_texts, 0, result, config); + + // 3) Populate recipient/address/account fields directly from anchors if not already found + if result.address_line1.is_none() { + result.address_line1 = get_string_by_anchor(&config.recipient_address_line1, extracted_texts); + } + if result.address_line2.is_none() { + result.address_line2 = get_string_by_anchor(&config.recipient_address_line2, extracted_texts); } - let for_search_start = find_spaced_account_and_start(extracted_texts, result, config); - handle_for_and_extract(extracted_texts, for_search_start, result, config); + if result.account_spaced.is_none() { + result.account_spaced = get_string_by_anchor(&config.account_spaced, extracted_texts); + } + if result.account_ms.is_none() { + result.account_ms = get_string_by_anchor(&config.account, extracted_texts); + } + validate_account_match(result); } -// helper: if address lines already known, look for the anchor in this stream and pick following token -fn find_account_after_anchor_in_stream( +fn get_string_by_anchor( + anchor_offset: &AnchorOffset, extracted_texts: &[String], - result: &mut DetectionResult, - config: &DetectionConfig, -) -> bool { - if result.address_line1.is_some() - && result.address_line2.is_some() - && result.account_ms.is_none() - { - let anchor_text = config.account.text; - for (idx, t) in extracted_texts.iter().enumerate() { - if t.contains(anchor_text) { - // use the configured offset for account token - let off = config.account.offset; - let account_idx = idx + off; - if account_idx < extracted_texts.len() { - let account_candidate = &extracted_texts[account_idx]; - if !account_candidate.is_empty() { - info!( - "Found account number after anchor 
at offset {}: {}", - off, account_candidate - ); - result.account_ms = Some(account_candidate.clone()); - return true; - } - } +) -> Option { + for (idx, t) in extracted_texts.iter().enumerate() { + if t.contains(anchor_offset.text) { + let target_idx = idx + anchor_offset.offset; + if target_idx < extracted_texts.len() { + return Some(extracted_texts[target_idx].clone()); } } } - false + None } // look for spaced account after "For the Period" and return start index for FOR: scanning @@ -378,36 +394,40 @@ mod tests { use super::*; #[test] - fn test_find_spaced_account_after_for_period() { - // Simulate a small token stream that might appear near the account header - let tokens = vec![ - "Account Summary".to_string(), - "For the Period September 1".to_string(), - "-".to_string(), - "30, 2025".to_string(), - "123 - 456789 - 012".to_string(), - ]; - let mut res = DetectionResult::default(); - let config = DetectionConfig::default(); - analyze_extracted_texts(&tokens, &mut res, &config); - assert_eq!(res.account_spaced, Some("123 - 456789 - 012".to_string())); - } - - #[test] - fn test_for_name_and_address_extraction_and_anchor_account() { + fn test_analyze_extracted_texts() { // Realistic token stream: FOR: name, address tokens, then account anchor and number - let tokens = vec![ - "FOR:".to_string(), - "John Doe".to_string(), - "123 Market St".to_string(), - "Cityville 12345".to_string(), - "Account Details".to_string(), - "Morgan Stanley at Work Self-Directed Account".to_string(), - "987654321".to_string(), - ]; - let mut res = DetectionResult::default(); + let tokens: Vec = [ + "Beginning Total Value ", + "$", + "12,345.67", + "Ending Total Value ", + "$1.23", + "Includes Accrued Interest", + "CLIENT STATEMENT ", + "For the Period September 1", + "-", + "30, 2025", + "012 - 345678 - 910 -", + "4 - 1", + "STATEMENT", + " FOR:", + "John Doe", + "", + "Morgan Stanley Smith Barney LLC. 
Member SIPC.", + "E*TRADE is a business of Morgan Stanley.", + "#ABCDEFG", + "John Doe", + "123 Market St", + "Cityville 12345 WHOKNOWS", + "Account Details", + "Morgan Stanley at Work Self-Directed Account", + "987654321", + ] + .iter() + .map(|s| s.to_string()) + .collect(); let config = DetectionConfig::default(); - analyze_extracted_texts(&tokens, &mut res, &config); + let res = analyze_extracted_texts(&tokens, &config); assert_eq!(res.name, Some("John Doe".to_string())); assert_eq!(res.address_line1, Some("123 Market St".to_string())); assert_eq!(res.address_line2, Some("Cityville 12345".to_string())); From 8d0f3700be37f00f4289971e5b41de7e3c6d76f3 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sat, 13 Dec 2025 20:47:15 +0100 Subject: [PATCH 18/21] refactor detect logic for improved maintainability and configurability --- src/anonymizer/README.md | 1 - src/anonymizer/detect.rs | 296 ++++++++++++++------------------------- 2 files changed, 103 insertions(+), 194 deletions(-) diff --git a/src/anonymizer/README.md b/src/anonymizer/README.md index aff0979..9842ce9 100644 --- a/src/anonymizer/README.md +++ b/src/anonymizer/README.md @@ -87,7 +87,6 @@ These are the only PII categories we explicitly target for now. # Todo ## Detect -- Change the logic so that the search for each AnchorOffset happens from the beginning. This makes algorithm simple, allows for easy configurtion change. It's more maintainable. - Add note that Detect step is only for most sensitive data? 
## Replace diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index 541fa34..c9dd3a3 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -87,8 +87,8 @@ impl DetectionResult { pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { let pdf_data = read_pdf(input_path)?; - let mut result = DetectionResult::default(); let config = DetectionConfig::default(); + let mut result = DetectionResult::default(); for stream in stream_scanner(&pdf_data) { if !stream.valid_end_marker { @@ -100,9 +100,9 @@ pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { } match extract_texts_from_stream(stream.compressed) { Ok(extracted) => { - analyze_extracted_texts(&extracted, &mut result, &config); + result = analyze_extracted_texts(&extracted, &config); if result.all_found() { - debug!("All target PII categories found; stopping search early."); + debug!("All target PII categories found."); break; } } @@ -167,36 +167,77 @@ pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box> { pub(crate) fn analyze_extracted_texts( extracted_texts: &[String], - result: &mut DetectionResult, config: &DetectionConfig, -) { +) -> DetectionResult { debug!("Analyzing {} extracted tokens", extracted_texts.len()); for (i, txt) in extracted_texts.iter().enumerate() { debug!(" [{}] {}", i, txt); } - // Per README TODO: search for each AnchorOffset from the beginning. 
- // 1) Find spaced account anywhere in the stream - let _start = find_spaced_account_and_start(extracted_texts, result, config); - - // 2) Run FOR: name/address extraction from the beginning - handle_for_and_extract(extracted_texts, 0, result, config); - - // 3) Populate recipient/address/account fields directly from anchors if not already found - if result.address_line1.is_none() { - result.address_line1 = get_string_by_anchor(&config.recipient_address_line1, extracted_texts); + let id = get_string_by_anchor(&config.recipient_code, extracted_texts); + let name = get_string_by_anchor(&config.name, extracted_texts); + let address_line1 = get_string_by_anchor(&config.recipient_address_line1, extracted_texts); + let address_line2 = get_string_by_anchor(&config.recipient_address_line2, extracted_texts); + let account_spaced = get_string_by_anchor(&config.account_spaced, extracted_texts); + let account_ms = get_string_by_anchor(&config.account, extracted_texts); + // Log what we found or didn't find + if let Some(ref v) = id { + info!("Found recipient code: {}", v); + } else { + warn!( + "Recipient code not found via anchor: {}", + config.recipient_code.text + ); } - if result.address_line2.is_none() { - result.address_line2 = get_string_by_anchor(&config.recipient_address_line2, extracted_texts); + if let Some(ref v) = name { + info!("Found name: {}", v); + } else { + warn!("Name not found via anchor: {}", config.name.text); } - if result.account_spaced.is_none() { - result.account_spaced = get_string_by_anchor(&config.account_spaced, extracted_texts); + if let Some(ref v) = address_line1 { + info!("Found address_line1: {}", v); + } else { + warn!( + "Address line 1 not found via anchor: {}", + config.recipient_address_line1.text + ); } - if result.account_ms.is_none() { - result.account_ms = get_string_by_anchor(&config.account, extracted_texts); + if let Some(ref v) = address_line2 { + info!("Found address_line2: {}", v); + } else { + warn!( + "Address line 2 not 
found via anchor: {}", + config.recipient_address_line2.text + ); + } + if let Some(ref v) = account_spaced { + info!("Found spaced account: {}", v); + } else { + warn!( + "Spaced account not found via anchor: {}", + config.account_spaced.text + ); + } + if let Some(ref v) = account_ms { + info!("Found ms account: {}", v); + } else { + warn!("MS account not found via anchor: {}", config.account.text); } - validate_account_match(result); + let acct_validation = validate_account_match(&account_spaced, &account_ms); + match acct_validation { + Some(true) => info!("Account validation: MATCH"), + Some(false) => warn!("Account validation: MISMATCH"), + None => warn!("Account validation: SKIPPED (missing token)"), + } + DetectionResult { + id, + name, + address_line1, + address_line2, + account_spaced, + account_ms, + } } fn get_string_by_anchor( @@ -214,177 +255,19 @@ fn get_string_by_anchor( None } -// look for spaced account after "For the Period" and return start index for FOR: scanning -fn find_spaced_account_and_start( - extracted_texts: &[String], - result: &mut DetectionResult, - config: &DetectionConfig, -) -> usize { - let mut for_search_start: usize = 0; - for (i, txt) in extracted_texts.iter().enumerate() { - if txt.contains(config.account_spaced.text) { - // use the configured offset for spaced account token - let offset = config.account_spaced.offset; - if i + offset < extracted_texts.len() { - let account_full = extracted_texts[i + offset].clone(); - let account = account_full.as_str(); - if account.contains(" - ") && account.chars().any(|c| c.is_numeric()) { - info!( - "Found account number (with spaces) after 'For the Period': {}", - account - ); - result.account_spaced = Some(account_full.clone()); - // start FOR: search after the account token (offset + 1) - for_search_start = i + offset + 1; - break; - } - } - } - } - for_search_start -} - -// handle FOR: marker - extract name and next two non-empty tokens as address lines; attempt anchor-based ms 
account after -fn handle_for_and_extract( - extracted_texts: &[String], - start: usize, - result: &mut DetectionResult, - config: &DetectionConfig, -) { - for (i, txt) in extracted_texts.iter().enumerate().skip(start) { - if txt.contains(config.name.text) { - // name offset: where the actual name token is relative to the FOR: anchor - let name_offset = config.name.offset; - if i + name_offset >= extracted_texts.len() { - continue; - } - let name_full = extracted_texts[i + name_offset].clone(); - let name = name_full.as_str(); - if !name.is_empty() { - let mut ctx: Vec = Vec::new(); - for j in 0..4 { - if i + 1 + j < extracted_texts.len() { - ctx.push(extracted_texts[i + 1 + j].clone()); - } - } - info!( - "Found name after 'FOR:': {} -- context: {:?}", - name_full, ctx - ); - if result.name.is_none() { - result.name = Some(name_full.clone()); - } - } - - // Deterministic rule: unconditionally capture the next two non-empty tokens after the name. - // Prefer a later occurrence of the same name (some PDFs repeat the name and the address appears after the second occurrence). - let mut anchor_index = i + name_offset; // default: position of the name after FOR: - for k in (i + 2)..extracted_texts.len() { - if extracted_texts[k].contains(&name_full) { - anchor_index = k; - break; - } - } - - // If we found a later occurrence, check for ID immediately before it. - if anchor_index > i + 1 { - let id_candidate = &extracted_texts[anchor_index - 1]; - if !id_candidate.is_empty() { - info!("Found ID before name anchor: {}", id_candidate); - result.id = Some(id_candidate.clone()); - } - } - - let mut collected = 0; - let mut look = 1; // start looking after the anchor name - while collected < 2 && anchor_index + look < extracted_texts.len() { - let candidate_full = extracted_texts[anchor_index + look].clone(); - let candidate = candidate_full.as_str(); - look += 1; - if candidate.is_empty() { - continue; - } - - // Always capture the next two non-empty tokens as address lines. 
- collected += 1; - if collected == 1 { - info!( - "Captured address_line1 after name (anchor_index={}): {} -- token_index={}", - anchor_index, - candidate, - anchor_index + look - 1 - ); - if result.address_line1.is_none() { - result.address_line1 = Some(candidate_full.clone()); - } - } else { - info!( - "Captured address_line2 after name (anchor_index={}): {} -- token_index={}", - anchor_index, - candidate, - anchor_index + look - 1 - ); - if result.address_line2.is_none() { - result.address_line2 = Some(candidate_full.clone()); - } - } - } +// Validate account spaced vs non-spaced (compare digits-only). Logs a warning on mismatch. +fn validate_account_match(spaced: &Option, ms: &Option) -> Option { + let digits_only = |s: &str| s.chars().filter(|c| c.is_numeric()).collect::(); - // Immediately after capturing the two address lines, pick the first non-empty token - // that follows anchor - if result.address_line1.is_some() && result.address_line2.is_some() { - // First: look for the specific preceding anchor and take the next token. 
- let mut found_via_anchor = false; - let anchor_text = config.account.text; - let mut anchor_idx = None; - for idx in (anchor_index + look)..extracted_texts.len() { - if extracted_texts[idx].contains(anchor_text) { - anchor_idx = Some(idx); - break; - } - } - - if let Some(ai) = anchor_idx { - // use configured offset relative to found anchor - let off = config.account.offset; - let account_idx = ai + off; - if account_idx < extracted_texts.len() { - let account_candidate = extracted_texts[account_idx].clone(); - if !account_candidate.is_empty() { - info!( - "Found account number after anchor '{}' at offset {}: {}", - anchor_text, off, account_candidate - ); - result.account_ms = Some(account_candidate.clone()); - found_via_anchor = true; - } - } - } - - if found_via_anchor { - return; // found via anchor, we're done - } - } + match (spaced, ms) { + (Some(s), Some(m)) => { + let ds = digits_only(s); + let dm = digits_only(m); + Some(ds == dm) } - } -} - -// Validate account spaced vs non-spaced (compare digits-only) -fn validate_account_match(result: &DetectionResult) { - if let (Some(spaced), Some(ms)) = (&result.account_spaced, &result.account_ms) { - let digits_only = |s: &str| s.chars().filter(|c| c.is_numeric()).collect::(); - let ds = digits_only(spaced); - let dm = digits_only(ms); - if ds == dm { - info!( - "Validated account: spaced='{}' matches non-spaced='{}'", - spaced, ms - ); - } else { - warn!( - "Account mismatch: spaced='{}' vs non-spaced='{}' (digits: {} != {})", - spaced, ms, ds, dm - ); + _ => { + // One or both values missing; nothing to validate. 
+ None } } } @@ -395,7 +278,7 @@ mod tests { #[test] fn test_analyze_extracted_texts() { - // Realistic token stream: FOR: name, address tokens, then account anchor and number + // Semi-realistic token stream let tokens: Vec = [ "Beginning Total Value ", "$", @@ -430,7 +313,34 @@ mod tests { let res = analyze_extracted_texts(&tokens, &config); assert_eq!(res.name, Some("John Doe".to_string())); assert_eq!(res.address_line1, Some("123 Market St".to_string())); - assert_eq!(res.address_line2, Some("Cityville 12345".to_string())); + assert_eq!( + res.address_line2, + Some("Cityville 12345 WHOKNOWS".to_string()) + ); assert_eq!(res.account_ms, Some("987654321".to_string())); } + + #[test] + fn test_validate_account_match_matching() { + let spaced = Some("012 - 345678 - 910 -".to_string()); + let ms = Some("012345678910".to_string()); + let res = validate_account_match(&spaced, &ms); + assert_eq!(res, Some(true)); + } + + #[test] + fn test_validate_account_match_mismatch() { + let spaced = Some("012 - 345678 - 910 -".to_string()); + let ms = Some("987654321".to_string()); + let res = validate_account_match(&spaced, &ms); + assert_eq!(res, Some(false)); + } + + #[test] + fn test_validate_account_match_missing() { + let spaced: Option = None; + let ms = Some("987654321".to_string()); + let res = validate_account_match(&spaced, &ms); + assert_eq!(res, None); + } } From a0dc3f4a23dc456ca6c66653ea6c466560847d59 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sat, 13 Dec 2025 21:04:30 +0100 Subject: [PATCH 19/21] update license headers and enhance module documentation across the anonymizer --- src/anonymizer/README.md | 5 +++++ src/anonymizer/anonymizer.rs | 14 ++++++++++++-- src/anonymizer/detect.rs | 27 +++++++++++++++++++++++++++ src/anonymizer/list.rs | 19 +++++++++++++++++++ src/anonymizer/path.rs | 16 ++++++++++++++++ src/anonymizer/pdf.rs | 9 ++++++--- src/anonymizer/replace.rs | 15 +++++++++++++++ 7 files changed, 100 insertions(+), 5 deletions(-) diff --git 
a/src/anonymizer/README.md b/src/anonymizer/README.md index 9842ce9..e1190da 100644 --- a/src/anonymizer/README.md +++ b/src/anonymizer/README.md @@ -1,3 +1,8 @@ + + # etradeAnonymizer Minimal Rust tool for: diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index d79076a..d4247d1 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -1,6 +1,16 @@ -// SPDX-FileCopyrightText: 2024-2025 RustInFinance +// SPDX-FileCopyrightText: 2025 RustInFinance // SPDX-License-Identifier: BSD-3-Clause +//! etradeAnonymizer - PDF anonymization tool for E*TRADE / Morgan Stanley statements. +//! +//! This tool provides three subcommands: +//! - `list`: List all text tokens from FlateDecode streams in a PDF +//! - `detect`: Heuristically detect PII and print a replacement command +//! - `replace`: Apply explicit string replacements to PDF FlateDecode streams +//! +//! The tool operates on tightly structured PDF FlateDecode streams and preserves +//! the original file structure by performing in-place replacements with exact-size matching. + mod list; mod detect; mod path; @@ -39,7 +49,7 @@ enum Commands { input_file: PathBuf, /// Path to the output PDF file output_file: PathBuf, - /// Pairs of strings to replace: ... + /// Pairs of strings to replace: `"" "" "" "" ...` #[arg(required = true, num_args = 2..)] replacements: Vec, }, diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs index c9dd3a3..78b2673 100644 --- a/src/anonymizer/detect.rs +++ b/src/anonymizer/detect.rs @@ -1,12 +1,36 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +//! PII detection module for anonymizer. +//! +//! This module provides heuristic detection of personally identifiable information (PII) +//! in E*TRADE / Morgan Stanley PDF statement FlateDecode streams. It searches for: +//! - Recipient code (ID) +//! - Name +//! - Address lines (two lines) +//! - Account numbers (spaced and non-spaced formats) +//! 
+//! Detection is based on anchor text patterns and relative offsets within the token stream. +//! Once all PII categories are found, the module prints a `replace` command suitable for +//! shell invocation with the detected tokens. + use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; use std::error::Error; +/// Configuration for locating a token via an anchor text and offset. +/// +/// The `text` field identifies an anchor string in the token stream, +/// and `offset` specifies how many tokens ahead the target token is located. pub(crate) struct AnchorOffset { pub text: &'static str, pub offset: usize, } +/// Detection configuration specifying anchor patterns for each PII category. +/// +/// Each field is an `AnchorOffset` that defines the anchor text and relative position +/// of the target PII token within the extracted text stream. pub(crate) struct DetectionConfig { pub account: AnchorOffset, pub account_spaced: AnchorOffset, @@ -58,6 +82,9 @@ impl Default for DetectionConfig { } } +/// Result of PII detection, holding detected tokens for each category. +/// +/// Fields are `None` if the corresponding PII was not found. #[derive(Default, Debug)] pub(crate) struct DetectionResult { id: Option, diff --git a/src/anonymizer/list.rs b/src/anonymizer/list.rs index 7634f52..25603e9 100644 --- a/src/anonymizer/list.rs +++ b/src/anonymizer/list.rs @@ -1,8 +1,27 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +//! Text listing module for anonymizer. +//! +//! This module provides functionality to extract and list all text tokens from +//! FlateDecode streams in a PDF. Each token is printed with a global index, +//! useful for understanding the structure and content of the PDF before detection/replacement. 
+ use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; use log::{info, warn}; use std::error::Error; use std::path::Path; +/// List all text tokens from FlateDecode streams in the PDF at `input_path`. +/// +/// Prints each extracted token with a global index to stdout. +/// Logs warnings for streams that fail to decompress or have invalid markers. +/// +/// # Arguments +/// * `input_path` - Path to the input PDF file. +/// +/// # Returns +/// `Ok(())` on success, or an error if the PDF cannot be read. pub fn list_texts(input_path: &Path) -> Result<(), Box> { let pdf_data = read_pdf(input_path)?; diff --git a/src/anonymizer/path.rs b/src/anonymizer/path.rs index 4309f9f..a61e480 100644 --- a/src/anonymizer/path.rs +++ b/src/anonymizer/path.rs @@ -1,8 +1,24 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +//! Path utility module for anonymizer. +//! +//! Provides helper functions for generating anonymized output file paths +//! by prefixing filenames with `anonymous_` while preserving directory structure. + use std::path::PathBuf; /// Build an output path by prefixing the input filename with `anonymous_`. /// /// Preserves the parent directory if present and returns a `PathBuf`. +/// +/// # Examples +/// ```ignore +/// use std::path::Path; +/// let input = Path::new("data/statement.pdf"); +/// let output = anonymous_output_path(input); +/// assert_eq!(output, Path::new("data/anonymous_statement.pdf")); +/// ``` pub(crate) fn anonymous_output_path(input_path: &std::path::Path) -> PathBuf { let file_name = input_path .file_name() diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs index 7fe5da5..b1e7dd7 100644 --- a/src/anonymizer/pdf.rs +++ b/src/anonymizer/pdf.rs @@ -1,14 +1,18 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + //! PDF parsing utilities: header validation, stream extraction, text token parsing. //! 
This module is intentionally strict and only supports a narrow subset of PDF //! objects used by the target documents: FlateDecode streams with explicit /Length. + use flate2::read::ZlibDecoder; use flate2::write::ZlibEncoder; use flate2::Compression; use log::{debug, error, info, warn}; use regex::bytes::Regex; +use std::error::Error; use std::fs::File; use std::io::{Read, Write}; -use std::error::Error; // Centralized constants and helpers for PDF parsing to reduce duplication between detect/replace. /// Expected PDF header (strictly enforced). @@ -182,8 +186,7 @@ pub(crate) fn extract_texts_from_stream( let mut decoder = ZlibDecoder::new(compressed_data); let mut decompressed = Vec::new(); decoder.read_to_end(&mut decompressed)?; - let text_re = - Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; + let text_re = Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; let mut extracted_texts: Vec = Vec::new(); for text_caps in text_re.captures_iter(&decompressed) { if let Some(txt) = text_caps.get(1) { diff --git a/src/anonymizer/replace.rs b/src/anonymizer/replace.rs index 51d777e..6a784b7 100644 --- a/src/anonymizer/replace.rs +++ b/src/anonymizer/replace.rs @@ -1,3 +1,18 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +//! String replacement module for anonymizer. +//! +//! This module applies specified text replacements to all FlateDecode streams in a PDF. +//! For each stream, the module: +//! 1. Decompresses the stream data +//! 2. Applies all specified string replacements +//! 3. Recompresses the modified text to the exact original size (with padding if necessary) +//! 4. Writes the modified PDF to the output file +//! +//! The in-place replacement strategy avoids rebuilding the PDF's XREF table, +//! ensuring the output PDF remains valid without full PDF structure parsing. 
+ use super::pdf::{process_stream, read_pdf, stream_scanner}; use log::{debug, info, warn}; use std::fs::File; From 45fa8045a7ab35d1e5f38c084106d9e3bdb91398 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Sat, 13 Dec 2025 21:07:46 +0100 Subject: [PATCH 20/21] rustfmt --- src/anonymizer/anonymizer.rs | 39 ++++++++++++++++++++++++++++-------- src/anonymizer/list.rs | 8 ++++++-- src/anonymizer/path.rs | 5 ++++- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs index d4247d1..832dc87 100644 --- a/src/anonymizer/anonymizer.rs +++ b/src/anonymizer/anonymizer.rs @@ -11,8 +11,8 @@ //! The tool operates on tightly structured PDF FlateDecode streams and preserves //! the original file structure by performing in-place replacements with exact-size matching. -mod list; mod detect; +mod list; mod path; mod pdf; mod replace; @@ -74,7 +74,9 @@ fn main() -> Result<(), Box> { replacements, } => { if replacements.len() % 2 != 0 { - return Err("Replacements must be provided as pairs: ".into()); + return Err( + "Replacements must be provided as pairs: ".into(), + ); } let replacement_pairs: Vec<(String, String)> = replacements .chunks(2) @@ -96,7 +98,11 @@ mod tests { #[test] fn test_detect_mode() -> Result<(), Box> { let sample = std::path::Path::new("anonymizer_data/sample_statement.pdf"); - assert!(sample.exists(), "Required test file missing: {}", sample.display()); + assert!( + sample.exists(), + "Required test file missing: {}", + sample.display() + ); detect::detect_pii(sample)?; Ok(()) @@ -109,16 +115,33 @@ mod tests { let output_dir = "target/test_outputs"; let output_pdf = std::path::Path::new("target/test_outputs/out_sample_statement.pdf"); - assert!(sample.exists(), "Required test file missing: {}", sample.display()); - assert!(expected_pdf.exists(), "Required test file missing: {}", expected_pdf.display()); + assert!( + sample.exists(), + "Required test file missing: {}", + sample.display() + 
); + assert!( + expected_pdf.exists(), + "Required test file missing: {}", + expected_pdf.display() + ); fs::create_dir_all(output_dir)?; let replacements = vec![ ("JAN KOWALSKI".to_string(), "XXXXXXXXXXXX".to_string()), - ("UL. SWIETOKRZYSKA 12".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), - ("WARSAW 00-916 POLAND".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), - ("012 - 345678 - 910 -".to_string(), "XXXXXXXXXXXXXXXXXXXX".to_string()), + ( + "UL. SWIETOKRZYSKA 12".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), + ( + "WARSAW 00-916 POLAND".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), + ( + "012 - 345678 - 910 -".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), ("012-345678-910".to_string(), "XXXXXXXXXXXXXX".to_string()), ]; diff --git a/src/anonymizer/list.rs b/src/anonymizer/list.rs index 25603e9..4931e02 100644 --- a/src/anonymizer/list.rs +++ b/src/anonymizer/list.rs @@ -36,10 +36,14 @@ pub fn list_texts(input_path: &Path) -> Result<(), Box> { } match extract_texts_from_stream(stream.compressed) { Ok(extracted_texts) => { - info!("stream {} has {} extracted tokens", stream_id, extracted_texts.len()); + info!( + "stream {} has {} extracted tokens", + stream_id, + extracted_texts.len() + ); for txt in extracted_texts.iter() { println!(" [{}] {}", global_text_id, txt); - global_text_id+=1; + global_text_id += 1; } } Err(e) => { diff --git a/src/anonymizer/path.rs b/src/anonymizer/path.rs index a61e480..cd94b05 100644 --- a/src/anonymizer/path.rs +++ b/src/anonymizer/path.rs @@ -49,7 +49,10 @@ mod tests { fn test_anonymous_output_path_with_parent() { let in_path = std::path::Path::new("some/dir/statement.pdf"); let out = anonymous_output_path(in_path); - assert_eq!(out, std::path::PathBuf::from("some/dir/anonymous_statement.pdf")); + assert_eq!( + out, + std::path::PathBuf::from("some/dir/anonymous_statement.pdf") + ); } #[test] From 062bf3c787f51340ac248618d0bd47566dbd9109 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: 
Fri, 19 Dec 2025 21:32:57 +0100 Subject: [PATCH 21/21] Fix PDF text extraction to handle escaped parentheses Updated regex pattern to properly capture strings containing escaped parentheses (\( and \)) in PDF streams. Implemented PDF 1.3 spec-compliant unescape function supporting all escape sequences (\n, \r, \t, \b, \f, \(, \), \\, and \ddd octal). Now extracts 618 texts instead of 529, including previously missing 'NET CREDITS/(DEBITS)' and other parenthesized strings like dollar amounts and date ranges. Added comprehensive unit tests. --- src/anonymizer/pdf.rs | 166 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 164 insertions(+), 2 deletions(-) diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs index b1e7dd7..f82c9da 100644 --- a/src/anonymizer/pdf.rs +++ b/src/anonymizer/pdf.rs @@ -180,23 +180,185 @@ pub(crate) fn extract_stream_bytes<'a>( /// Decompress stream and extract text tokens from PDF text operators /// Decompress a FlateDecode stream and extract text tokens appearing in `( .. ) Tj` operators. +/// Handles escaped parentheses `\(` and `\)` within PDF string literals. pub(crate) fn extract_texts_from_stream( compressed_data: &[u8], ) -> Result, Box> { let mut decoder = ZlibDecoder::new(compressed_data); let mut decompressed = Vec::new(); decoder.read_to_end(&mut decompressed)?; - let text_re = Regex::new(r"\(([^)]+)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; + // Updated regex to handle escaped characters: (?:[^)\\]|\\.)* matches either + // a non-special char OR a backslash followed by any char (handles \(, \), \\, \n, etc.) 
+ let text_re = + Regex::new(r"\(((?:[^)\\]|\\.)*)\)\s*Tj").map_err(|e| Box::new(e) as Box)?; let mut extracted_texts: Vec = Vec::new(); for text_caps in text_re.captures_iter(&decompressed) { if let Some(txt) = text_caps.get(1) { - extracted_texts.push(String::from_utf8_lossy(txt.as_bytes()).to_string()); + let text_bytes = txt.as_bytes(); + // Unescape PDF string literal escape sequences + let unescaped = unescape_pdf_string(text_bytes); + extracted_texts.push(String::from_utf8_lossy(&unescaped).to_string()); } } Ok(extracted_texts) } +/// Unescape PDF string literal escape sequences per PDF 1.3 spec (Table 3.2). +/// Handles: \n \r \t \b \f \( \) \\ and \ddd (octal, 1-3 digits). +/// Per spec: "If the character following the backslash is not one of those shown +/// in the table, the backslash is ignored." +fn unescape_pdf_string(data: &[u8]) -> Vec { + let mut result = Vec::with_capacity(data.len()); + let mut i = 0; + + while i < data.len() { + if data[i] == b'\\' && i + 1 < data.len() { + let (output, bytes_consumed) = handle_pdf_escape(&data[i + 1..]); + if let Some(byte) = output { + result.push(byte); + } + i += bytes_consumed; + } else { + result.push(data[i]); + i += 1; + } + } + result +} + +/// Handle a single PDF escape sequence starting after the backslash. +/// Returns (output byte if any, number of bytes to advance including the backslash). +fn handle_pdf_escape(data: &[u8]) -> (Option, usize) { + if data.is_empty() { + return (None, 1); // Lone backslash at end + } + + match data[0] { + b'n' => (Some(b'\n'), 2), + b'r' => (Some(b'\r'), 2), + b't' => (Some(b'\t'), 2), + b'b' => (Some(b'\x08'), 2), // backspace + b'f' => (Some(b'\x0C'), 2), // form feed + b'(' => (Some(b'('), 2), + b')' => (Some(b')'), 2), + b'\\' => (Some(b'\\'), 2), + b'0'..=b'7' => parse_pdf_octal_escape(data), + // Per spec: ignore backslash for unrecognized escapes + _ => (Some(data[0]), 2), + } +} + +/// Parse octal escape sequence \ddd (1-3 octal digits). 
+/// Returns (parsed byte, bytes consumed including backslash). +fn parse_pdf_octal_escape(data: &[u8]) -> (Option, usize) { + let mut end = 0; + // Consume up to 3 octal digits + while end < data.len() && end < 3 && data[end].is_ascii_digit() && data[end] <= b'7' { + end += 1; + } + + if let Ok(octal_str) = std::str::from_utf8(&data[..end]) { + if let Ok(value) = u8::from_str_radix(octal_str, 8) { + return (Some(value), end + 1); // +1 for the backslash + } + } + + // Fallback: ignore backslash if parsing fails + (Some(data[0]), 2) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_unescape_simple_escapes() { + // Test all simple escape sequences + assert_eq!(unescape_pdf_string(br"\n"), b"\n"); + assert_eq!(unescape_pdf_string(br"\r"), b"\r"); + assert_eq!(unescape_pdf_string(br"\t"), b"\t"); + assert_eq!(unescape_pdf_string(br"\b"), b"\x08"); // backspace + assert_eq!(unescape_pdf_string(br"\f"), b"\x0C"); // form feed + assert_eq!(unescape_pdf_string(br"\("), b"("); + assert_eq!(unescape_pdf_string(br"\)"), b")"); + assert_eq!(unescape_pdf_string(br"\\"), b"\\"); + } + + #[test] + fn test_unescape_octal_sequences() { + // Single digit octal + assert_eq!(unescape_pdf_string(br"\0"), b"\x00"); + assert_eq!(unescape_pdf_string(br"\7"), b"\x07"); + + // Two digit octal + assert_eq!(unescape_pdf_string(br"\53"), b"+"); // \053 = 43 decimal = '+' + + // Three digit octal + assert_eq!(unescape_pdf_string(br"\053"), b"+"); + assert_eq!(unescape_pdf_string(br"\245"), b"\xA5"); // 165 decimal + assert_eq!(unescape_pdf_string(br"\307"), b"\xC7"); // 199 decimal + + // Octal followed by non-digit (from PDF spec example) + assert_eq!(unescape_pdf_string(br"\0053"), b"\x053"); // \005 + '3' + } + + #[test] + fn test_unescape_real_world_case() { + // The actual case from the PDF that was failing + assert_eq!( + unescape_pdf_string(br"NET CREDITS/\(DEBITS\)"), + b"NET CREDITS/(DEBITS)" + ); + + // Dollar amount with parentheses + 
assert_eq!(unescape_pdf_string(br"\(6,085.80\)"), b"(6,085.80)"); + + // Date range + assert_eq!( + unescape_pdf_string(br"\(9/1/25-9/30/25\)"), + b"(9/1/25-9/30/25)" + ); + } + + #[test] + fn test_unescape_unrecognized_escape() { + // Per spec: "If the character following the backslash is not one of those + // shown in the table, the backslash is ignored." + assert_eq!(unescape_pdf_string(br"\x"), b"x"); + assert_eq!(unescape_pdf_string(br"\q"), b"q"); + assert_eq!(unescape_pdf_string(br"\Z"), b"Z"); + } + + #[test] + fn test_unescape_mixed_content() { + // Mix of regular text, escapes, and parentheses + assert_eq!( + unescape_pdf_string(br"Hello\nWorld\t\(test\)"), + b"Hello\nWorld\t(test)" + ); + + // \\ becomes \, then 053 is literal text (not preceded by backslash after unescape) + // Then \245 becomes byte 0xA5 + assert_eq!( + unescape_pdf_string(br"Price: \(\\053\245\)"), + b"Price: (\\053\xA5)" + ); + } + + #[test] + fn test_unescape_edge_cases() { + // Empty string + assert_eq!(unescape_pdf_string(b""), b""); + + // No escapes + assert_eq!(unescape_pdf_string(b"plain text"), b"plain text"); + + // Backslash at end (no following character) + assert_eq!(unescape_pdf_string(b"text\\"), b"text\\"); + } +} + // === Stream replacement & recompression utilities (migrated from streams.rs) === /// Replace all non-overlapping occurrences of `search` with `replace` in `data`.