diff --git a/Cargo.lock b/Cargo.lock index 0a0813e..7b0c369 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -92,10 +92,54 @@ dependencies = [ ] [[package]] -name = "ansi_term" -version = "0.9.0" +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.0", +] [[package]] name = "argminmax" @@ -137,17 +181,6 @@ version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.5.0" @@ -175,12 +208,6 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" -[[package]] -name = "bitflags" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4efd02e230a02e18f92fc2735f44597385ed02ad8f831e7c1c1156ee5e1ab3a5" - [[package]] name = "bitflags" version = "1.3.2" @@ -313,19 +340,44 @@ dependencies = [ [[package]] name = "clap" -version = "2.27.1" +version = "4.5.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8c532887f1a292d17de05ae858a8fe50a301e196f9ef0ddb7ccd0d1d00f180" +checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" dependencies = [ - "ansi_term", - "atty", - "bitflags 0.9.1", + "anstream", + "anstyle", + "clap_lex", "strsim", - "textwrap", - "unicode-width 0.1.14", - "vec_map", ] +[[package]] +name = "clap_derive" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.106", 
+] + +[[package]] +name = "clap_lex" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" + [[package]] name = "cmake" version = "0.1.54" @@ -344,6 +396,12 @@ dependencies = [ "encoding_rs", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "colored" version = "2.2.0" @@ -362,7 +420,7 @@ checksum = "b03b7db8e0b4b2fdad6c551e634134e99ec000e5c8c3b6856c65e8bbaded7a3b" dependencies = [ "crossterm", "unicode-segmentation", - "unicode-width 0.2.1", + "unicode-width", ] [[package]] @@ -602,6 +660,7 @@ dependencies = [ "chrono", "clap", "csv", + "flate2", "fltk", "holidays", "log", @@ -657,9 +716,9 @@ checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "miniz_oxide", @@ -861,13 +920,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] -name = "hermit-abi" -version = "0.1.19" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "holidays" @@ -1127,6 +1183,12 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.10.5" @@ -1279,6 +1341,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -1399,6 +1462,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "opaque-debug" version = "0.3.1" @@ -2193,6 +2262,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + [[package]] name = "simdutf8" version = "0.1.5" @@ -2321,9 +2396,9 @@ dependencies = [ [[package]] name = "strsim" -version = "0.6.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum_macros" @@ -2331,7 +2406,7 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -2431,15 +2506,6 @@ dependencies = [ "windows-sys 0.61.0", ] -[[package]] -name = "textwrap" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0b59b6b4b44d867f1370ef1bd91bfb262bf07bf0ae65c202ea2fbc16153b693" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -2633,12 +2699,6 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" -[[package]] -name = "unicode-width" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" - [[package]] name = "unicode-width" version = "0.2.1" @@ -2664,16 +2724,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] -name = "vcpkg" -version = "0.2.15" +name = "utf8parse" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] -name = "vec_map" -version = "0.8.2" +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "version_check" diff --git a/Cargo.toml b/Cargo.toml index 3b16afc..b1979e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,9 @@ homepage = "https://github.com/jczaja/e-trade-tax-return-pl-helper" [[bin]] name = "etradeTaxReturnHelper" path = "src/main.rs" +[[bin]] +name = "etradeAnonymizer" +path = "src/anonymizer/anonymizer.rs" [[bin]] name = "gen_exchange_rates" @@ -35,7 +38,7 @@ serde = { version = "1.0.104", features = ["derive"] } roxmltree = "0.20.0" simple_logger = "4.0.0" log = "0.4.0" -clap = "~2.27.0" +clap = { version = "4.5.51", features = ["derive"] } regex = "1.3.3" calamine = "0.22.1" wild = "2.2.0" @@ -45,3 +48,5 @@ polars = "0.35.4" csv = "1.3.0" serde_json = { version = "=1.0.133", optional = true } holidays = { version = "0.1.0", default-features = false, features = ["PL"] } + +flate2 = "1.1.5" diff --git a/REUSE.toml b/REUSE.toml index d0c53c7..725c37d 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -22,6 +22,7 @@ path = [ "data/ecb_example_response.xml", "revolut_data/*.csv", "revolut_data/*.tsv", + "anonymizer_data/*", ] SPDX-FileCopyrightText = "2025 RustInFinance" SPDX-License-Identifier = "LicenseRef-Private-Data" diff --git a/anonymizer_data/expected_detect_output.txt b/anonymizer_data/expected_detect_output.txt new file mode 100644 index 0000000..8498362 --- /dev/null +++ b/anonymizer_data/expected_detect_output.txt @@ -0,0 +1 @@ +replace "sample_statement.pdf" "out_sample_statement.pdf" "JAN KOWALSKI" "XXXXXXXXXXXX" "UL. 
SWIETOKRZYSKA 12" "XXXXXXXXXXXXXXXXXXXX" "WARSAW 00-916 POLAND" "XXXXXXXXXXXXXXXXXXXX" "012 - 345678 - 910 -" "XXXXXXXXXXXXXXXXXXXX" "012-345678-910" "XXXXXXXXXXXXXX"
\ No newline at end of file
diff --git a/src/anonymizer/README.md b/src/anonymizer/README.md
new file mode 100644
index 0000000..e1190da
--- /dev/null
+++ b/src/anonymizer/README.md
@@ -0,0 +1,100 @@
+
+
+# etradeAnonymizer
+
+Minimal Rust tool for:
+- Detecting personally identifiable information (PII) tokens in tightly structured PDF FlateDecode streams.
+- Emitting a shell-friendly replace command line.
+- Applying replacement strings while preserving the original stream size (padding when needed).
+
+## Usage
+
+Detect mode (prints a replacement command suggestion):
+```
+cargo run --bin etradeAnonymizer -- detect statement.pdf
+```
+
+Replace mode (apply explicit replacements):
+```
+cargo run --bin etradeAnonymizer -- replace input.pdf output.pdf "JAN KOWALSKI" "XXXXX XXXXXXXX"
+```
+
+You can chain multiple pairs:
+```
+cargo run --bin etradeAnonymizer -- replace in.pdf out.pdf "A" "X" "B" "Y"
+```
+
+## Build & Test
+```
+cargo build --release --bin etradeAnonymizer
+cargo test --bin etradeAnonymizer
+```
+
+Resulting binary: `target/release/etradeAnonymizer`.
+
+## Design Notes
+- The PDF header (`%PDF-1.3\n`) is strictly enforced; unsupported PDFs are skipped gracefully. This keeps the tool simple.
+- Only FlateDecode streams with an explicit `/Length` are processed, as described below.
+- Replacement recompresses the modified stream; if no compression level fits the original size, the original compressed stream is kept.
+
+### Why Padding? (Architecture Note)
+This tool avoids full PDF parsing and rebuilding. Instead, it modifies streams **in-place**.
+- PDF files rely on a Cross-Reference (XREF) table that stores the byte offset of every object.
+- If we changed the length of a stream object, all subsequent object offsets would shift, invalidating the XREF table.
+- To avoid rebuilding the XREF table (which requires full PDF structure understanding), we ensure the modified stream is **exactly the same length** as the original.
+- We achieve this by recompressing the modified text. If the new compressed data is smaller, we **pad** the remainder with null bytes (`0x00`).
+- If the new compressed data is larger than the original (even at best compression), we cannot safely replace it without corrupting the file, so we fall back to keeping the original stream (and warn the user).
+
+### Exact PDF object pattern searched
+The tool searches for PDF objects that exactly match the following pattern (both human-readable and via regex):
+
+Human-readable pattern:
+
+```
+<object number> <generation> obj
+<<
+/Length <stream length>
+/Filter [/FlateDecode]
+>>
+stream
+<compressed data>
+endstream
+endobj
+```
+
+Regex used in code (PCRE-style):
+
+```
+(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n
+```
+
+For simplicity, only objects matching this pattern are considered for detection and replacement.
+
+## License
+See `BSD-3-Clause` in the `LICENSES/` directory.
+
+## Disclaimer
+
+Please note: this tool attempts to detect and replace common personally identifiable
+information (PII) tokens in tightly structured PDF streams, but there is no guarantee
+that all PII will be detected or removed. You must manually review the resulting
+file and verify that sensitive information has been removed before sharing or
+publishing.
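+
+One possible manual check (illustrative, not a guarantee of completeness): rerun the
+tool in `list` mode on the anonymized output and search the dumped tokens for your
+own data, e.g.:
+
+```
+cargo run --bin etradeAnonymizer -- list anonymous_statement.pdf | grep -i "kowalski"
+```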
+The maintainers make reasonable efforts to identify the following categories:
+
+ - First & last name
+ - Mailing address (two lines)
+ - Account number
+
+These are the only PII categories we explicitly target for now.
+
+## Todo
+### Detect
+- Add a note that the Detect step covers only the most sensitive data?
+
+### Replace
+It is not valid to remove every occurrence of Beginning Total Value (the amount in dollars) from the PDF, because the same value might appear in a section used for calculations.
+- Keep the replace logic for the most sensitive data.
+- Introduce an additional data format for removing financial data (less sensitive).
diff --git a/src/anonymizer/anonymizer.rs b/src/anonymizer/anonymizer.rs
new file mode 100644
index 0000000..832dc87
--- /dev/null
+++ b/src/anonymizer/anonymizer.rs
@@ -0,0 +1,158 @@
+// SPDX-FileCopyrightText: 2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+//! etradeAnonymizer - PDF anonymization tool for E*TRADE / Morgan Stanley statements.
+//!
+//! This tool provides three subcommands:
+//! - `list`: List all text tokens from FlateDecode streams in a PDF
+//! - `detect`: Heuristically detect PII and print a replacement command
+//! - `replace`: Apply explicit string replacements to PDF FlateDecode streams
+//!
+//! The tool operates on tightly structured PDF FlateDecode streams and preserves
+//! the original file structure by performing in-place replacements with exact-size matching.
+
+mod detect;
+mod list;
+mod path;
+mod pdf;
+mod replace;
+
+use clap::{Parser, Subcommand};
+use std::env;
+use std::error::Error;
+use std::path::PathBuf;
+
+/// Tool for anonymizing PDF files by replacing specific strings in FlateDecode streams
+#[derive(Parser)]
+#[command(name = "etradeAnonymizer")]
+#[command(version, about, long_about = None)]
+struct Cli {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    /// List all text tokens from FlateDecode streams in the PDF
+    List {
+        /// Path to the input PDF file
+        input_file: PathBuf,
+    },
+    /// Detect PII (name, address, account) in the PDF and print replacement command
+    Detect {
+        /// Path to the input PDF file
+        input_file: PathBuf,
+    },
+    /// Replace strings in PDF FlateDecode streams and save to output file
+    Replace {
+        /// Path to the input PDF file
+        input_file: PathBuf,
+        /// Path to the output PDF file
+        output_file: PathBuf,
+        /// Pairs of strings to replace: `"<original>" "<replacement>" "<original>" "<replacement>" ...`
+        #[arg(required = true, num_args = 2..)]
+        replacements: Vec<String>,
+    },
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    // Ensure users see warnings and errors by default even when RUST_LOG is not set.
+    // If RUST_LOG is provided, simple_logger will respect it; otherwise we default to `warn`.
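+    // For example, `RUST_LOG=debug etradeAnonymizer detect statement.pdf` (an
+    // illustrative invocation) also surfaces per-token debug output from detection.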
+ if env::var("RUST_LOG").is_err() { + env::set_var("RUST_LOG", "warn"); + } + simple_logger::SimpleLogger::new().env().init().unwrap(); + + let cli = Cli::parse(); + + match cli.command { + Commands::List { input_file } => list::list_texts(&input_file), + Commands::Detect { input_file } => detect::detect_pii(&input_file), + Commands::Replace { + input_file, + output_file, + replacements, + } => { + if replacements.len() % 2 != 0 { + return Err( + "Replacements must be provided as pairs: ".into(), + ); + } + let replacement_pairs: Vec<(String, String)> = replacements + .chunks(2) + .map(|chunk| (chunk[0].clone(), chunk[1].clone())) + .collect(); + replace::replace_pii(&input_file, &output_file, &replacement_pairs) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + // Note: These tests require 'anonymizer_data' directory to be present in the working directory + // when running 'cargo test'. + + #[test] + fn test_detect_mode() -> Result<(), Box> { + let sample = std::path::Path::new("anonymizer_data/sample_statement.pdf"); + assert!( + sample.exists(), + "Required test file missing: {}", + sample.display() + ); + + detect::detect_pii(sample)?; + Ok(()) + } + + #[test] + fn test_replace_pii() -> Result<(), Box> { + let sample = std::path::Path::new("anonymizer_data/sample_statement.pdf"); + let expected_pdf = std::path::Path::new("anonymizer_data/expected_statement.pdf"); + let output_dir = "target/test_outputs"; + let output_pdf = std::path::Path::new("target/test_outputs/out_sample_statement.pdf"); + + assert!( + sample.exists(), + "Required test file missing: {}", + sample.display() + ); + assert!( + expected_pdf.exists(), + "Required test file missing: {}", + expected_pdf.display() + ); + + fs::create_dir_all(output_dir)?; + + let replacements = vec![ + ("JAN KOWALSKI".to_string(), "XXXXXXXXXXXX".to_string()), + ( + "UL. SWIETOKRZYSKA 12".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), + ( + "WARSAW 00-916 POLAND".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), + ( + "012 - 345678 - 910 -".to_string(), + "XXXXXXXXXXXXXXXXXXXX".to_string(), + ), + ("012-345678-910".to_string(), "XXXXXXXXXXXXXX".to_string()), + ]; + + replace::replace_pii(sample, output_pdf, &replacements)?; + + let produced = fs::read(output_pdf)?; + let expected = fs::read(expected_pdf)?; + assert_eq!(produced, expected, "produced PDF differs from expected"); + + // Cleanup + let _ = fs::remove_file(output_pdf); + Ok(()) + } +} diff --git a/src/anonymizer/detect.rs b/src/anonymizer/detect.rs new file mode 100644 index 0000000..78b2673 --- /dev/null +++ b/src/anonymizer/detect.rs @@ -0,0 +1,373 @@ +// SPDX-FileCopyrightText: 2025 RustInFinance +// SPDX-License-Identifier: BSD-3-Clause + +//! PII detection module for anonymizer. +//! +//! This module provides heuristic detection of personally identifiable information (PII) +//! in E*TRADE / Morgan Stanley PDF statement FlateDecode streams. It searches for: +//! - Recipient code (ID) +//! - Name +//! - Address lines (two lines) +//! - Account numbers (spaced and non-spaced formats) +//! +//! Detection is based on anchor text patterns and relative offsets within the token stream. +//! Once all PII categories are found, the module prints a `replace` command suitable for +//! shell invocation with the detected tokens. + +use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner}; +use log::{debug, info, warn}; +use std::error::Error; + +/// Configuration for locating a token via an anchor text and offset. 
+///
+/// The `text` field identifies an anchor string in the token stream,
+/// and `offset` specifies how many tokens ahead the target token is located.
+pub(crate) struct AnchorOffset {
+    pub text: &'static str,
+    pub offset: usize,
+}
+
+/// Detection configuration specifying anchor patterns for each PII category.
+///
+/// Each field is an `AnchorOffset` that defines the anchor text and relative position
+/// of the target PII token within the extracted text stream.
+pub(crate) struct DetectionConfig {
+    pub account: AnchorOffset,
+    pub account_spaced: AnchorOffset,
+    pub name: AnchorOffset,
+    pub recipient_code: AnchorOffset,
+    pub recipient_address_line1: AnchorOffset,
+    pub recipient_address_line2: AnchorOffset,
+}
+
+// For each category, find the first token at `anchor + offset`; the replace step then
+// redacts every occurrence of that token. Intended for the most sensitive data only.
+impl Default for DetectionConfig {
+    fn default() -> Self {
+        Self {
+            // [148] 012-345678-910
+            account: AnchorOffset {
+                text: "Morgan Stanley at Work Self-Directed Account",
+                offset: 1,
+            },
+            // [10] 012 - 345678 - 910 -
+            account_spaced: AnchorOffset {
+                text: "For the Period",
+                offset: 3,
+            },
+            // [14] JAN KOWALSKI
+            name: AnchorOffset {
+                text: "FOR:",
+                offset: 1,
+            },
+            /*
+            [18] #ABCDEFG
+            [19] JAN KOWALSKI
+            [20] UL. SWIETOKRZYSKA 12
+            [21] WARSAW 00-916 POLAND
+            */
+            // recipient tokens follow the same anchor; offsets are 1, 3, 4
+            recipient_code: AnchorOffset {
+                text: "E*TRADE is a business of Morgan Stanley.",
+                offset: 1,
+            },
+            recipient_address_line1: AnchorOffset {
+                text: "E*TRADE is a business of Morgan Stanley.",
+                offset: 3,
+            },
+            recipient_address_line2: AnchorOffset {
+                text: "E*TRADE is a business of Morgan Stanley.",
+                offset: 4,
+            },
+        }
+    }
+}
+
+/// Result of PII detection, holding detected tokens for each category.
+///
+/// Fields are `None` if the corresponding PII was not found.
+#[derive(Default, Debug)]
+pub(crate) struct DetectionResult {
+    id: Option<String>,
+    name: Option<String>,
+    address_line1: Option<String>,
+    address_line2: Option<String>,
+    account_spaced: Option<String>,
+    account_ms: Option<String>,
+}
+
+impl DetectionResult {
+    fn all_found(&self) -> bool {
+        self.id.is_some()
+            && self.name.is_some()
+            && self.address_line1.is_some()
+            && self.address_line2.is_some()
+            && self.account_spaced.is_some()
+            && self.account_ms.is_some()
+    }
+}
+
+/// Detect PII tokens in `input_path` and print a replacement command line.
+///
+/// The function inspects FlateDecode streams, extracts text tokens and heuristically
+/// determines name/address/account tokens. It prints a single `replace` command
+/// suitable for shell use.
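+///
+/// The printed command has this shape (placeholders shown here, not output from a
+/// real statement; the actual tokens and X-padding lengths come from the document):
+///
+/// ```text
+/// replace "statement.pdf" "anonymous_statement.pdf" "<token>" "XXXXXX" ...
+/// ```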
+pub fn detect_pii(input_path: &std::path::Path) -> Result<(), Box<dyn Error>> {
+    let pdf_data = read_pdf(input_path)?;
+
+    let config = DetectionConfig::default();
+    let mut result = DetectionResult::default();
+
+    for stream in stream_scanner(&pdf_data) {
+        if !stream.valid_end_marker {
+            warn!(
+                "Skipping stream due to end-marker mismatch for object at {}",
+                stream.object_start
+            );
+            continue;
+        }
+        match extract_texts_from_stream(stream.compressed) {
+            Ok(extracted) => {
+                result = analyze_extracted_texts(&extracted, &config);
+                if result.all_found() {
+                    debug!("All target PII categories found.");
+                    break;
+                }
+            }
+            Err(e) => {
+                warn!(
+                    "Failed to extract texts from stream at {}: {}",
+                    stream.object_start, e
+                );
+            }
+        }
+    }
+
+    let out_path = super::path::anonymous_output_path(input_path);
+
+    // Build the final ordered list: id, name, addr1, addr2, account_spaced, account_ms
+    let mut final_texts: Vec<String> = Vec::new();
+    let mut inserted = std::collections::HashSet::new();
+    if let Some(id) = &result.id {
+        if inserted.insert(id.clone()) {
+            final_texts.push(id.clone());
+        }
+    }
+    if let Some(n) = &result.name {
+        if inserted.insert(n.clone()) {
+            final_texts.push(n.clone());
+        }
+    }
+    if let Some(a1) = &result.address_line1 {
+        if inserted.insert(a1.clone()) {
+            final_texts.push(a1.clone());
+        }
+    }
+    if let Some(a2) = &result.address_line2 {
+        if inserted.insert(a2.clone()) {
+            final_texts.push(a2.clone());
+        }
+    }
+    if let Some(sp) = &result.account_spaced {
+        if inserted.insert(sp.clone()) {
+            final_texts.push(sp.clone());
+        }
+    }
+    if let Some(ms) = &result.account_ms {
+        if inserted.insert(ms.clone()) {
+            final_texts.push(ms.clone());
+        }
+    }
+
+    print!(
+        "replace \"{}\" \"{}\"",
+        input_path.display(),
+        out_path.display()
+    );
+    for txt in &final_texts {
+        let replacement = "X".repeat(txt.len());
+        print!(" \"{}\" \"{}\"", txt, replacement);
+    }
+    println!();
+
+    Ok(())
+}
+
+pub(crate) fn analyze_extracted_texts(
+    extracted_texts: &[String],
+    config: &DetectionConfig,
+) -> DetectionResult {
+    debug!("Analyzing {} extracted tokens", extracted_texts.len());
+    for (i, txt) in extracted_texts.iter().enumerate() {
+        debug!(" [{}] {}", i, txt);
+    }
+
+    let id = get_string_by_anchor(&config.recipient_code, extracted_texts);
+    let name = get_string_by_anchor(&config.name, extracted_texts);
+    let address_line1 = get_string_by_anchor(&config.recipient_address_line1, extracted_texts);
+    let address_line2 = get_string_by_anchor(&config.recipient_address_line2, extracted_texts);
+    let account_spaced = get_string_by_anchor(&config.account_spaced, extracted_texts);
+    let account_ms = get_string_by_anchor(&config.account, extracted_texts);
+    // Log what we found or didn't find
+    if let Some(ref v) = id {
+        info!("Found recipient code: {}", v);
+    } else {
+        warn!(
+            "Recipient code not found via anchor: {}",
+            config.recipient_code.text
+        );
+    }
+    if let Some(ref v) = name {
+        info!("Found name: {}", v);
+    } else {
+        warn!("Name not found via anchor: {}", config.name.text);
+    }
+    if let Some(ref v) = address_line1 {
+        info!("Found address_line1: {}", v);
+    } else {
+        warn!(
+            "Address line 1 not found via anchor: {}",
+            config.recipient_address_line1.text
+        );
+    }
+    if let Some(ref v) = address_line2 {
+        info!("Found address_line2: {}", v);
+    } else {
+        warn!(
+            "Address line 2 not found via anchor: {}",
+            config.recipient_address_line2.text
+        );
+    }
+    if let Some(ref v) = account_spaced {
+        info!("Found spaced account: {}", v);
+    } else {
+        warn!(
+            "Spaced account not found via anchor: {}",
+            config.account_spaced.text
+        );
+    }
+    if let Some(ref v) = account_ms {
+        info!("Found ms account: {}", v);
+    } else {
+        warn!("MS account not found via anchor: {}", config.account.text);
+    }
+
+    let acct_validation = validate_account_match(&account_spaced, &account_ms);
+    match acct_validation {
+        Some(true) => info!("Account validation: MATCH"),
+        Some(false) => warn!("Account validation: MISMATCH"),
+        None => warn!("Account validation: SKIPPED (missing token)"),
+    }
+    DetectionResult {
+        id,
+        name,
+        address_line1,
+        address_line2,
+        account_spaced,
+        account_ms,
+    }
+}
+
+fn get_string_by_anchor(
+    anchor_offset: &AnchorOffset,
+    extracted_texts: &[String],
+) -> Option<String> {
+    for (idx, t) in extracted_texts.iter().enumerate() {
+        if t.contains(anchor_offset.text) {
+            let target_idx = idx + anchor_offset.offset;
+            if target_idx < extracted_texts.len() {
+                return Some(extracted_texts[target_idx].clone());
+            }
+        }
+    }
+    None
+}
+
+// Validate spaced vs non-spaced account numbers by comparing digits only; the caller
+// logs a warning on mismatch.
+fn validate_account_match(spaced: &Option<String>, ms: &Option<String>) -> Option<bool> {
+    let digits_only = |s: &str| s.chars().filter(|c| c.is_numeric()).collect::<String>();
+
+    match (spaced, ms) {
+        (Some(s), Some(m)) => {
+            let ds = digits_only(s);
+            let dm = digits_only(m);
+            Some(ds == dm)
+        }
+        _ => {
+            // One or both values missing; nothing to validate.
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_analyze_extracted_texts() {
+        // Semi-realistic token stream
+        let tokens: Vec<String> = [
+            "Beginning Total Value ",
+            "$",
+            "12,345.67",
+            "Ending Total Value ",
+            "$1.23",
+            "Includes Accrued Interest",
+            "CLIENT STATEMENT ",
+            "For the Period September 1",
+            "-",
+            "30, 2025",
+            "012 - 345678 - 910 -",
+            "4 - 1",
+            "STATEMENT",
+            " FOR:",
+            "John Doe",
+            "",
+            "Morgan Stanley Smith Barney LLC. Member SIPC.",
+            "E*TRADE is a business of Morgan Stanley.",
+            "#ABCDEFG",
+            "John Doe",
+            "123 Market St",
+            "Cityville 12345 WHOKNOWS",
+            "Account Details",
+            "Morgan Stanley at Work Self-Directed Account",
+            "987654321",
+        ]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+        let config = DetectionConfig::default();
+        let res = analyze_extracted_texts(&tokens, &config);
+        assert_eq!(res.name, Some("John Doe".to_string()));
+        assert_eq!(res.address_line1, Some("123 Market St".to_string()));
+        assert_eq!(
+            res.address_line2,
+            Some("Cityville 12345 WHOKNOWS".to_string())
+        );
+        assert_eq!(res.account_ms, Some("987654321".to_string()));
+    }
+
+    #[test]
+    fn test_validate_account_match_matching() {
+        let spaced = Some("012 - 345678 - 910 -".to_string());
+        let ms = Some("012345678910".to_string());
+        let res = validate_account_match(&spaced, &ms);
+        assert_eq!(res, Some(true));
+    }
+
+    #[test]
+    fn test_validate_account_match_mismatch() {
+        let spaced = Some("012 - 345678 - 910 -".to_string());
+        let ms = Some("987654321".to_string());
+        let res = validate_account_match(&spaced, &ms);
+        assert_eq!(res, Some(false));
+    }
+
+    #[test]
+    fn test_validate_account_match_missing() {
+        let spaced: Option<String> = None;
+        let ms = Some("987654321".to_string());
+        let res = validate_account_match(&spaced, &ms);
+        assert_eq!(res, None);
+    }
+}
diff --git a/src/anonymizer/list.rs b/src/anonymizer/list.rs
new file mode 100644
index 0000000..4931e02
--- /dev/null
+++ b/src/anonymizer/list.rs
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+//! Text listing module for anonymizer.
+//!
+//! This module provides functionality to extract and list all text tokens from
+//! FlateDecode streams in a PDF. Each token is printed with a global index,
+//! useful for understanding the structure and content of the PDF before detection/replacement.
+
+use crate::pdf::{extract_texts_from_stream, read_pdf, stream_scanner};
+use log::{info, warn};
+use std::error::Error;
+use std::path::Path;
+
+/// List all text tokens from FlateDecode streams in the PDF at `input_path`.
+///
+/// Prints each extracted token with a global index to stdout.
+/// Logs warnings for streams that fail to decompress or have invalid markers.
+///
+/// # Arguments
+/// * `input_path` - Path to the input PDF file.
+///
+/// # Returns
+/// `Ok(())` on success, or an error if the PDF cannot be read.
+pub fn list_texts(input_path: &Path) -> Result<(), Box<dyn Error>> {
+    let pdf_data = read_pdf(input_path)?;
+
+    let mut global_text_id = 0;
+    for (stream_id, stream) in stream_scanner(&pdf_data).enumerate() {
+        if !stream.valid_end_marker {
+            warn!(
+                "Skipping stream due to end-marker mismatch for object at {}",
+                stream.object_start
+            );
+            continue;
+        }
+        match extract_texts_from_stream(stream.compressed) {
+            Ok(extracted_texts) => {
+                info!(
+                    "stream {} has {} extracted tokens",
+                    stream_id,
+                    extracted_texts.len()
+                );
+                for txt in extracted_texts.iter() {
+                    println!(" [{}] {}", global_text_id, txt);
+                    global_text_id += 1;
+                }
+            }
+            Err(e) => {
+                warn!(
+                    "Failed to extract texts from stream at {}: {}",
+                    stream.object_start, e
+                );
+            }
+        }
+    }
+    Ok(())
+}
diff --git a/src/anonymizer/path.rs b/src/anonymizer/path.rs
new file mode 100644
index 0000000..cd94b05
--- /dev/null
+++ b/src/anonymizer/path.rs
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: 2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+//! Path utility module for anonymizer.
+//!
+//! Provides helper functions for generating anonymized output file paths
+//! by prefixing filenames with `anonymous_` while preserving directory structure.
+
+use std::path::PathBuf;
+
+/// Build an output path by prefixing the input filename with `anonymous_`.
+///
+/// Preserves the parent directory if present and returns a `PathBuf`.
+///
+/// # Examples
+/// ```ignore
+/// use std::path::Path;
+/// let input = Path::new("data/statement.pdf");
+/// let output = anonymous_output_path(input);
+/// assert_eq!(output, Path::new("data/anonymous_statement.pdf"));
+/// ```
+pub(crate) fn anonymous_output_path(input_path: &std::path::Path) -> PathBuf {
+    let file_name = input_path
+        .file_name()
+        .map(|s| s.to_string_lossy().into_owned())
+        .unwrap_or_else(|| input_path.to_string_lossy().into_owned());
+
+    if let Some(parent) = input_path.parent() {
+        let mut pb = PathBuf::from(parent);
+        pb.push(format!("anonymous_{}", file_name));
+        pb
+    } else {
+        PathBuf::from(format!("anonymous_{}", file_name))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_anonymous_output_path_no_parent() {
+        let in_path = std::path::Path::new("statement.pdf");
+        let out = anonymous_output_path(in_path);
+        assert_eq!(out, std::path::PathBuf::from("anonymous_statement.pdf"));
+    }
+
+    #[test]
+    fn test_anonymous_output_path_with_parent() {
+        let in_path = std::path::Path::new("some/dir/statement.pdf");
+        let out = anonymous_output_path(in_path);
+        assert_eq!(
+            out,
+            std::path::PathBuf::from("some/dir/anonymous_statement.pdf")
+        );
+    }
+
+    #[test]
+    fn test_anonymous_output_path_unicode_filename() {
+        let in_path = std::path::Path::new("résumé.pdf");
+        let out = anonymous_output_path(in_path);
+        assert_eq!(out, std::path::PathBuf::from("anonymous_résumé.pdf"));
+    }
+}
diff --git a/src/anonymizer/pdf.rs b/src/anonymizer/pdf.rs
new file mode 100644
index 0000000..f82c9da
--- /dev/null
+++ b/src/anonymizer/pdf.rs
@@ -0,0 +1,459 @@
+// SPDX-FileCopyrightText: 2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+//! PDF parsing utilities: header validation, stream extraction, text token parsing.
+//! This module is intentionally strict and only supports a narrow subset of PDF
+//! objects used by the target documents: FlateDecode streams with explicit /Length.
+
+use flate2::read::ZlibDecoder;
+use flate2::write::ZlibEncoder;
+use flate2::Compression;
+use log::{debug, error, info, warn};
+use regex::bytes::Regex;
+use std::error::Error;
+use std::fs::File;
+use std::io::{Read, Write};
+
+// Centralized constants and helpers for PDF parsing to reduce duplication between detect/replace.
+/// Expected PDF header (strictly enforced).
+pub(crate) const PDF_HEADER: &[u8] = b"%PDF-1.3\n";
+/// Regex matching an object with FlateDecode stream and explicit /Length.
+pub(crate) const OBJ_STREAM_RE: &str = r"(?s)\d+\s+\d+\s+obj\s*<<\s*/Length\s+(\d+)\s*/Filter\s*\[\s*/FlateDecode\s*\]\s*>>\s*stream\n";
+
+/// Read entire PDF file and validate strict header.
+pub(crate) fn read_pdf(path: &std::path::Path) -> Result<Vec<u8>, Box<dyn Error>> {
+    let mut file = File::open(path)?;
+    let mut pdf_data = Vec::new();
+    file.read_to_end(&mut pdf_data)?;
+    if pdf_data.len() < PDF_HEADER.len() || &pdf_data[0..PDF_HEADER.len()] != PDF_HEADER {
+        error!(
+            "Unsupported PDF version or invalid PDF header at '{}'.",
+            path.display()
+        );
+        return Err("Invalid PDF header".into());
+    }
+    Ok(pdf_data)
+}
+
+/// Lightweight representation of a FlateDecode stream slice inside a PDF.
+pub(crate) struct StreamData<'a> {
+    pub object_start: usize,
+    pub data_start: usize,
+    pub compressed: &'a [u8],
+    pub valid_end_marker: bool,
+}
+
+/// Iterator over stream objects, avoiding allocating a full Vec upfront.
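+/// Yields one `StreamData` per matching FlateDecode object, in file order.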
+pub(crate) struct StreamScanner<'a> {
+    re: Regex,
+    data: &'a [u8],
+    search_from: usize,
+}
+
+/// Create a new streaming iterator over PDF FlateDecode objects.
+pub(crate) fn stream_scanner<'a>(pdf_data: &'a [u8]) -> StreamScanner<'a> {
+    StreamScanner {
+        re: Regex::new(OBJ_STREAM_RE).unwrap(),
+        data: pdf_data,
+        search_from: 0,
+    }
+}
+
+impl<'a> Iterator for StreamScanner<'a> {
+    type Item = StreamData<'a>;
+    fn next(&mut self) -> Option<Self::Item> {
+        while self.search_from < self.data.len() {
+            // Use captures_at to find next match at current position
+            if let Some(caps) = self.re.captures_at(self.data, self.search_from) {
+                let whole = caps.get(0)?;
+                // advance search past this object to avoid infinite loop
+                self.search_from = whole.end();
+                if let Some((compressed, data_start, valid)) =
+                    extract_stream_bytes(self.data, &caps)
+                {
+                    return Some(StreamData {
+                        object_start: whole.start(),
+                        data_start,
+                        compressed,
+                        valid_end_marker: valid,
+                    });
+                } else {
+                    continue; // skip invalid capture
+                }
+            } else {
+                self.search_from = self.data.len();
+            }
+        }
+        None
+    }
+}
+
+/// Given a capture for a stream object, validate the end marker and return the raw
+/// compressed data plus a validity flag.
+pub(crate) fn extract_stream_bytes<'a>(
+    pdf_data: &'a [u8],
+    caps: &regex::bytes::Captures<'a>,
+) -> Option<(&'a [u8], usize, bool)> {
+    // Strict project rule: expected end marker is fixed here
+    const EXPECTED_END: &[u8] = b"\nendstream\nendobj";
+    // Validate capture groups
+    let whole = match caps.get(0) {
+        Some(m) => m,
+        None => {
+            error!("PDF object capture missing whole-match");
+            return None;
+        }
+    };
+    let length_bytes = match caps.get(1) {
+        Some(m) => m.as_bytes(),
+        None => {
+            error!(
+                "PDF object capture missing /Length group at object starting {}",
+                whole.start()
+            );
+            return None;
+        }
+    };
+
+    // Parse length strictly; if it fails, we consider this object invalid
+    let length = match std::str::from_utf8(length_bytes)
+        .ok()
+        .and_then(|s| s.parse::<usize>().ok())
+    {
+        Some(v) => v,
+        None => {
+            error!(
+                "Invalid /Length value '{}' in object starting at {}",
+                String::from_utf8_lossy(length_bytes),
+                whole.start()
+            );
+            return None;
+        }
+    };
+
+    let data_start = whole.end();
+    let stream_end = match data_start.checked_add(length) {
+        Some(v) => v,
+        None => {
+            error!(
+                "Stream end overflow for object at {} (length={})",
+                data_start, length
+            );
+            return None;
+        }
+    };
+
+    // strict bounds checks: must be entirely within pdf_data
+    if stream_end > pdf_data.len() {
+        error!(
+            "Stream end out of bounds for object starting at {}: stream_end={} pdf_len={}",
+            data_start,
+            stream_end,
+            pdf_data.len()
+        );
+        return None;
+    }
+    if stream_end + EXPECTED_END.len() > pdf_data.len() {
+        error!(
+            "End marker out of bounds after stream_end {} for object starting at {} (pdf_len={})",
+            stream_end,
+            data_start,
+            pdf_data.len()
+        );
+        return None;
+    }
+
+    // Validate exact end marker (requirements are strict)
+    let debug_slice = &pdf_data[stream_end..stream_end + EXPECTED_END.len()];
+    if debug_slice != EXPECTED_END {
+        warn!(
+            "End marker mismatch for object starting at {}: found {:?}, expected {:?}",
+            data_start, debug_slice, EXPECTED_END
+        );
+        // Return the compressed candidate but flag the end-marker mismatch for the caller to decide
+        return Some((&pdf_data[data_start..stream_end], data_start, false));
+    }
+
+    Some((&pdf_data[data_start..stream_end], data_start, true))
+}
+
+/// Decompress a FlateDecode stream and extract text tokens appearing in `( .. ) Tj` operators.
+/// Handles escaped parentheses `\(` and `\)` within PDF string literals.
+pub(crate) fn extract_texts_from_stream(
+    compressed_data: &[u8],
+) -> Result<Vec<String>, Box<dyn Error>> {
+    let mut decoder = ZlibDecoder::new(compressed_data);
+    let mut decompressed = Vec::new();
+    decoder.read_to_end(&mut decompressed)?;
+    // Updated regex to handle escaped characters: (?:[^)\\]|\\.)* matches either
+    // a non-special char OR a backslash followed by any char (handles \(, \), \\, \n, etc.)
+    let text_re =
+        Regex::new(r"\(((?:[^)\\]|\\.)*)\)\s*Tj").map_err(|e| Box::new(e) as Box<dyn Error>)?;
+    let mut extracted_texts: Vec<String> = Vec::new();
+    for text_caps in text_re.captures_iter(&decompressed) {
+        if let Some(txt) = text_caps.get(1) {
+            let text_bytes = txt.as_bytes();
+            // Unescape PDF string literal escape sequences
+            let unescaped = unescape_pdf_string(text_bytes);
+            extracted_texts.push(String::from_utf8_lossy(&unescaped).to_string());
+        }
+    }
+
+    Ok(extracted_texts)
+}
+
+/// Unescape PDF string literal escape sequences per PDF 1.3 spec (Table 3.2).
+/// Handles: \n \r \t \b \f \( \) \\ and \ddd (octal, 1-3 digits).
+/// Per spec: "If the character following the backslash is not one of those shown
+/// in the table, the backslash is ignored."
+fn unescape_pdf_string(data: &[u8]) -> Vec<u8> {
+    let mut result = Vec::with_capacity(data.len());
+    let mut i = 0;
+
+    while i < data.len() {
+        if data[i] == b'\\' && i + 1 < data.len() {
+            let (output, bytes_consumed) = handle_pdf_escape(&data[i + 1..]);
+            if let Some(byte) = output {
+                result.push(byte);
+            }
+            i += bytes_consumed;
+        } else {
+            result.push(data[i]);
+            i += 1;
+        }
+    }
+    result
+}
+
+/// Handle a single PDF escape sequence starting after the backslash.
+/// Returns (output byte if any, number of bytes to advance including the backslash).
+fn handle_pdf_escape(data: &[u8]) -> (Option<u8>, usize) {
+    if data.is_empty() {
+        return (None, 1); // Lone backslash at end
+    }
+
+    match data[0] {
+        b'n' => (Some(b'\n'), 2),
+        b'r' => (Some(b'\r'), 2),
+        b't' => (Some(b'\t'), 2),
+        b'b' => (Some(b'\x08'), 2), // backspace
+        b'f' => (Some(b'\x0C'), 2), // form feed
+        b'(' => (Some(b'('), 2),
+        b')' => (Some(b')'), 2),
+        b'\\' => (Some(b'\\'), 2),
+        b'0'..=b'7' => parse_pdf_octal_escape(data),
+        // Per spec: ignore backslash for unrecognized escapes
+        _ => (Some(data[0]), 2),
+    }
+}
+
+/// Parse octal escape sequence \ddd (1-3 octal digits).
+/// Returns (parsed byte, bytes consumed including backslash).
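+/// For example, `\053` parses to `b'+'` and consumes 4 bytes (the backslash plus three digits).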
+fn parse_pdf_octal_escape(data: &[u8]) -> (Option<u8>, usize) {
+    let mut end = 0;
+    // Consume up to 3 octal digits
+    while end < data.len() && end < 3 && data[end].is_ascii_digit() && data[end] <= b'7' {
+        end += 1;
+    }
+
+    if let Ok(octal_str) = std::str::from_utf8(&data[..end]) {
+        if let Ok(value) = u8::from_str_radix(octal_str, 8) {
+            return (Some(value), end + 1); // +1 for the backslash
+        }
+    }
+
+    // Fallback: ignore backslash if parsing fails
+    (Some(data[0]), 2)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_unescape_simple_escapes() {
+        // Test all simple escape sequences
+        assert_eq!(unescape_pdf_string(br"\n"), b"\n");
+        assert_eq!(unescape_pdf_string(br"\r"), b"\r");
+        assert_eq!(unescape_pdf_string(br"\t"), b"\t");
+        assert_eq!(unescape_pdf_string(br"\b"), b"\x08"); // backspace
+        assert_eq!(unescape_pdf_string(br"\f"), b"\x0C"); // form feed
+        assert_eq!(unescape_pdf_string(br"\("), b"(");
+        assert_eq!(unescape_pdf_string(br"\)"), b")");
+        assert_eq!(unescape_pdf_string(br"\\"), b"\\");
+    }
+
+    #[test]
+    fn test_unescape_octal_sequences() {
+        // Single digit octal
+        assert_eq!(unescape_pdf_string(br"\0"), b"\x00");
+        assert_eq!(unescape_pdf_string(br"\7"), b"\x07");
+
+        // Two digit octal
+        assert_eq!(unescape_pdf_string(br"\53"), b"+"); // \053 = 43 decimal = '+'
+
+        // Three digit octal
+        assert_eq!(unescape_pdf_string(br"\053"), b"+");
+        assert_eq!(unescape_pdf_string(br"\245"), b"\xA5"); // 165 decimal
+        assert_eq!(unescape_pdf_string(br"\307"), b"\xC7"); // 199 decimal
+
+        // Octal followed by non-digit (from PDF spec example)
+        assert_eq!(unescape_pdf_string(br"\0053"), b"\x053"); // \005 + '3'
+    }
+
+    #[test]
+    fn test_unescape_real_world_case() {
+        // The actual case from the PDF that was failing
+        assert_eq!(
+            unescape_pdf_string(br"NET CREDITS/\(DEBITS\)"),
+            b"NET CREDITS/(DEBITS)"
+        );
+
+        // Dollar amount with parentheses
+        assert_eq!(unescape_pdf_string(br"\(6,085.80\)"), b"(6,085.80)");
+
+        // Date range
+        assert_eq!(
+            unescape_pdf_string(br"\(9/1/25-9/30/25\)"),
+            b"(9/1/25-9/30/25)"
+        );
+    }
+
+    #[test]
+    fn test_unescape_unrecognized_escape() {
+        // Per spec: "If the character following the backslash is not one of those
+        // shown in the table, the backslash is ignored."
+        assert_eq!(unescape_pdf_string(br"\x"), b"x");
+        assert_eq!(unescape_pdf_string(br"\q"), b"q");
+        assert_eq!(unescape_pdf_string(br"\Z"), b"Z");
+    }
+
+    #[test]
+    fn test_unescape_mixed_content() {
+        // Mix of regular text, escapes, and parentheses
+        assert_eq!(
+            unescape_pdf_string(br"Hello\nWorld\t\(test\)"),
+            b"Hello\nWorld\t(test)"
+        );
+
+        // \\ becomes \, then 053 is literal text (not preceded by backslash after unescape)
+        // Then \245 becomes byte 0xA5
+        assert_eq!(
+            unescape_pdf_string(br"Price: \(\\053\245\)"),
+            b"Price: (\\053\xA5)"
+        );
+    }
+
+    #[test]
+    fn test_unescape_edge_cases() {
+        // Empty string
+        assert_eq!(unescape_pdf_string(b""), b"");
+
+        // No escapes
+        assert_eq!(unescape_pdf_string(b"plain text"), b"plain text");
+
+        // Backslash at end (no following character)
+        assert_eq!(unescape_pdf_string(b"text\\"), b"text\\");
+    }
+}
+
+// === Stream replacement & recompression utilities (migrated from streams.rs) ===
+
+/// Replace all non-overlapping occurrences of `search` with `replace` in `data`.
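+/// Returns the rewritten buffer together with the number of occurrences replaced.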
+fn replace_bytes_all_occurrences(data: &[u8], search: &[u8], replace: &[u8]) -> (Vec<u8>, usize) {
+    let mut result = Vec::new();
+    let mut pos = 0;
+    let mut count = 0;
+    while pos < data.len() {
+        if pos + search.len() <= data.len() && &data[pos..pos + search.len()] == search {
+            result.extend_from_slice(replace);
+            pos += search.len();
+            count += 1;
+        } else {
+            result.push(data[pos]);
+            pos += 1;
+        }
+    }
+    (result, count)
+}
+
+/// Try progressive zlib compression levels (0..=9) returning the first compressed form whose length is <= `max_size`.
+fn find_fitting_compression(data: &[u8], max_size: usize) -> Option<(Vec<u8>, u32)> {
+    for level in 0..=9 {
+        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::new(level));
+        if encoder.write_all(data).is_err() {
+            continue;
+        }
+        let compressed = encoder.finish().ok()?;
+        if compressed.len() <= max_size {
+            return Some((compressed, level));
+        }
+    }
+    None
+}
+
+/// Decompress a stream, apply all replacements, and recompress if possible within
+/// the original compressed size. Returns new compressed bytes and per-pattern counts.
+pub(crate) fn process_stream(
+    compressed_data: &[u8],
+    replacements: &[(String, String)],
+) -> Result<(Vec<u8>, std::collections::HashMap<String, usize>), Box<dyn Error>> {
+    let original_len = compressed_data.len();
+    let mut decoder = ZlibDecoder::new(compressed_data);
+    let mut decompressed = Vec::new();
+    match decoder.read_to_end(&mut decompressed) {
+        Ok(_) => {
+            debug!("Decompressed: {} B", decompressed.len());
+            let mut modified_data = decompressed.clone();
+            let mut found_any = false;
+            let mut per_counts: std::collections::HashMap<String, usize> =
+                std::collections::HashMap::new();
+            for (needle, repl) in replacements {
+                let (new_data, occurrences) = replace_bytes_all_occurrences(
+                    &modified_data,
+                    needle.as_bytes(),
+                    repl.as_bytes(),
+                );
+                if occurrences > 0 {
+                    debug!("Found '{}' {} times", needle, occurrences);
+                    modified_data = new_data;
+                    per_counts.insert(needle.clone(), occurrences);
+                    found_any = true;
+                }
+            }
+            if found_any {
+                if let Some((fitting, level)) =
+                    find_fitting_compression(&modified_data, original_len)
+                {
+                    debug!(
+                        "Compression level {} produced {} B (<= {} B)",
+                        level,
+                        fitting.len(),
+                        original_len
+                    );
+                    info!(
+                        "Compressed stream with level {} ({} B)",
+                        level,
+                        fitting.len()
+                    );
+                    return Ok((fitting, per_counts));
+                } else {
+                    warn!(
+                        "All compression levels exceed original size {}; keeping original. PII MAY REMAIN EXPOSED!",
+                        original_len
+                    );
+                    info!(
+                        "Falling back to original compressed stream ({} B)",
+                        original_len
+                    );
+                }
+            }
+        }
+        Err(e) => {
+            error!("Decompression error: {}", e);
+        }
+    }
+    Ok((compressed_data.to_vec(), std::collections::HashMap::new()))
+}
diff --git a/src/anonymizer/replace.rs b/src/anonymizer/replace.rs
new file mode 100644
index 0000000..6a784b7
--- /dev/null
+++ b/src/anonymizer/replace.rs
@@ -0,0 +1,127 @@
+// SPDX-FileCopyrightText: 2025 RustInFinance
+// SPDX-License-Identifier: BSD-3-Clause
+
+//! String replacement module for anonymizer.
+//!
+//! This module applies specified text replacements to all FlateDecode streams in a PDF.
+//! For each stream, the module:
+//! 1. Decompresses the stream data
+//! 2. Applies all specified string replacements
+//! 3. Recompresses the modified text to the exact original size (with padding if necessary)
+//! 4. Writes the modified PDF to the output file
+//!
+//! The in-place replacement strategy avoids rebuilding the PDF's XREF table,
+//! ensuring the output PDF remains valid without full PDF structure parsing.
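+//!
+//! A typical invocation (mirroring the README usage; file names are illustrative):
+//!
+//! ```text
+//! etradeAnonymizer replace input.pdf output.pdf "JAN KOWALSKI" "XXXXXXXXXXXX"
+//! ```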
+
+use super::pdf::{process_stream, read_pdf, stream_scanner};
+use log::{debug, info, warn};
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+
+/// Replace occurrences of given `replacements` inside FlateDecode streams of `input_path` and
+/// write modified PDF to `output_path`.
+///
+/// Each replacement is a `(original, replacement)` pair. Compression is retried to fit
+/// the original stream size; if impossible, the original compressed stream is preserved.
+pub(crate) fn replace_pii(
+    input_path: &Path,
+    output_path: &Path,
+    replacements: &[(String, String)],
+) -> Result<(), Box<dyn std::error::Error>> {
+    info!("Loading: {}", input_path.display());
+
+    let pdf_data = match read_pdf(input_path) {
+        Ok(d) => d,
+        Err(_) => return Ok(()), // header or open error already logged
+    };
+
+    debug!("PDF Size: {} bytes", pdf_data.len());
+
+    let mut output_data = pdf_data.clone();
+    let mut streams_modified = 0;
+    let mut streams_total = 0;
+    // Aggregate counts per replacement for concise summary
+    let mut replacement_counts: std::collections::HashMap<String, usize> =
+        std::collections::HashMap::new();
+
+    for stream in stream_scanner(&pdf_data) {
+        if !stream.valid_end_marker {
+            warn!(
+                "Skipping stream due to end-marker mismatch for object at {}",
+                stream.object_start
+            );
+            continue;
+        }
+        streams_total += 1;
+
+        let compressed_data = stream.compressed;
+        let data_start = stream.data_start;
+        let stream_end = data_start + compressed_data.len();
+
+        // Decompress, modify, and recompress to exactly the original size.
+        debug!("═══ Stream #{} ═══", streams_total);
+        debug!(
+            "Position: {}-{} ({} B)",
+            data_start,
+            stream_end,
+            compressed_data.len()
+        );
+        let (new_compressed_data, stream_replacement_counts) =
+            process_stream(compressed_data, replacements)?;
+        // aggregate counts from this stream
+        let mut stream_total = 0usize;
+        if !stream_replacement_counts.is_empty() {
+            for (k, v) in stream_replacement_counts.iter() {
+                *replacement_counts.entry(k.clone()).or_insert(0) += *v;
+                stream_total += *v;
+            }
+        }
+
+        if stream_total > 0 {
+            streams_modified += 1;
+        }
+
+        // Write the (re)compressed bytes into the output buffer.
+        output_data[data_start..data_start + new_compressed_data.len()]
+            .copy_from_slice(&new_compressed_data);
+        debug!(
+            "Compression: {} → {} B",
+            compressed_data.len(),
+            new_compressed_data.len()
+        );
+
+        // Pad the remainder with zero bytes so the stream keeps its original length.
+        let padding_len = compressed_data.len() - new_compressed_data.len();
+        output_data[data_start + new_compressed_data.len()..data_start + compressed_data.len()]
+            .fill(0x00);
+
+        if padding_len > 0 {
+            info!(
+                "Applied padding of {} bytes to stream at {}",
+                padding_len, data_start
+            );
+        }
+    }
+
+    info!("Saving: {}", output_path.display());
+    File::create(output_path)?.write_all(&output_data)?;
+
+    info!("DONE!");
+    info!(
+        "Streams: total={} modified={}",
+        streams_total, streams_modified
+    );
+    let replacements_total: usize = replacement_counts.values().sum();
+    info!("Replacements total: {}", replacements_total);
+    if !replacement_counts.is_empty() {
+        info!("Breakdown:");
+        for (k, v) in replacement_counts.iter() {
+            info!("  '{}' -> {}", k, v);
+        }
+    }
+    info!("File: {}", output_path.display());
+
+    Ok(())
+}