From 6d05a7ae31016868343d4a23f9b4bd05748d1838 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 16 Jan 2026 12:40:03 -0800 Subject: [PATCH 1/9] reject leading and repeated commas --- arrow-json/src/reader/tape.rs | 208 +++++++++++++++++++++++++++++++--- 1 file changed, 190 insertions(+), 18 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 89ee3f778765..fc0612e8fcba 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -216,14 +216,22 @@ impl<'a> Tape<'a> { /// States based on #[derive(Debug, Copy, Clone)] enum DecoderState { - /// Decoding an object + /// Decoding an object - awaiting a '"' (new field) or '}' (done) /// /// Contains index of start [`TapeElement::StartObject`] Object(u32), - /// Decoding a list + /// Continue decoding an object - awaiting ',' (before next field) or '}' (done) + /// + /// Contains index of start [`TapeElement::StartObject`] + ContinueObject(u32), + /// Decoding a list - awaiting first element or ']' (done) /// /// Contains index of start [`TapeElement::StartList`] List(u32), + /// Continue decoding a list - awaiting ',' (before next element) or ']' (done) + /// + /// Contains index of start [`TapeElement::StartList`] + ContinueList(u32), String, Value, Number, @@ -242,8 +250,8 @@ enum DecoderState { impl DecoderState { fn as_str(&self) -> &'static str { match self { - DecoderState::Object(_) => "object", - DecoderState::List(_) => "list", + DecoderState::Object(_) | DecoderState::ContinueObject(_) => "object", + DecoderState::List(_) | DecoderState::ContinueList(_) => "list", DecoderState::String => "string", DecoderState::Value => "value", DecoderState::Number => "number", @@ -338,6 +346,20 @@ impl TapeDecoder { } } + /// Write the closing elements for an object to the tape + fn write_end_object(&mut self, start_idx: u32) { + let end_idx = self.elements.len() as u32; + self.elements[start_idx as usize] = TapeElement::StartObject(end_idx); + self.elements.push(TapeElement::EndObject(start_idx)); + } + + /// Write the closing elements for a list to the tape + fn write_end_list(&mut self, start_idx: u32) { + let end_idx = self.elements.len() as u32; + self.elements[start_idx as usize] = TapeElement::StartList(end_idx); + self.elements.push(TapeElement::EndList(start_idx)); + } + pub fn decode(&mut self, buf: &[u8]) -> Result { let mut iter = BufIter::new(buf); @@ -358,38 +380,71 @@ impl TapeDecoder { }; match state { - // Decoding an object + // Decoding an object - awaiting next field or '}' DecoderState::Object(start_idx) => { - iter.advance_until(|b| !json_whitespace(b) && b != b','); + let start_idx = *start_idx; + iter.skip_whitespace(); match next!(iter) { b'"' => { + *state = DecoderState::ContinueObject(start_idx); self.stack.push(DecoderState::Value); self.stack.push(DecoderState::Colon); self.stack.push(DecoderState::String); } b'}' => { - let start_idx = *start_idx; - let end_idx = self.elements.len() as u32; - self.elements[start_idx as usize] = TapeElement::StartObject(end_idx); - self.elements.push(TapeElement::EndObject(start_idx)); + self.write_end_object(start_idx); + self.stack.pop(); + } + b => return Err(err(b, "expected '\"' or '}'")), + } + } + // Continue decoding an object - awaiting ',' or '}' + DecoderState::ContinueObject(start_idx) => { + let start_idx = *start_idx; + iter.skip_whitespace(); + match next!(iter) { + b',' => { + *state = DecoderState::Object(start_idx); + } + b'}' => { + self.write_end_object(start_idx); self.stack.pop(); } - b => return Err(err(b, "parsing object")), + b => return Err(err(b, "expected ',' or '}'")), } } - // Decoding a list + // Decoding a list - awaiting next element or ']' DecoderState::List(start_idx) => { - iter.advance_until(|b| !json_whitespace(b) && b != b','); + let start_idx = *start_idx; + iter.skip_whitespace(); + match iter.peek() { + Some(b']') => { + iter.next(); + self.write_end_list(start_idx); + self.stack.pop(); + } + Some(_) => { + *state = DecoderState::ContinueList(start_idx); + self.stack.push(DecoderState::Value); + } + None => break, + } + } + // Continue decoding a list - awaiting ',' or ']' + DecoderState::ContinueList(start_idx) => { + let start_idx = *start_idx; + iter.skip_whitespace(); match iter.peek() { + Some(b',') => { + iter.next(); + *state = DecoderState::List(start_idx); + } Some(b']') => { iter.next(); - let start_idx = *start_idx; - let end_idx = self.elements.len() as u32; - self.elements[start_idx as usize] = TapeElement::StartList(end_idx); - self.elements.push(TapeElement::EndList(start_idx)); + self.write_end_list(start_idx); self.stack.pop(); } - Some(_) => self.stack.push(DecoderState::Value), + Some(b) => return Err(err(b, "expected ',' or ']'")), None => break, } } @@ -972,4 +1027,121 @@ mod tests { let res = decoder.decode(b"{\"test\": \"\\udc00\\udc01\"}"); assert!(res.is_err()); } + + #[test] + fn test_valid_comma_usage() { + // Verify that valid JSON with proper comma usage still works + + // Valid object with commas + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{"a": 1, "b": 2, "c": 3}"#; + decoder.decode(json.as_bytes()).unwrap(); + let tape = decoder.finish().unwrap(); + let mut s = String::new(); + tape.serialize(&mut s, 1); + assert!(s.contains("\"a\"")); + assert!(s.contains("\"b\"")); + assert!(s.contains("\"c\"")); + + // Valid list with commas + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[1, 2, 3, 4]"#; + decoder.decode(json.as_bytes()).unwrap(); + let tape = decoder.finish().unwrap(); + let mut s = String::new(); + tape.serialize(&mut s, 1); + assert!(s.contains("1")); + assert!(s.contains("2")); + assert!(s.contains("3")); + assert!(s.contains("4")); + + // Empty object (no commas needed) + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{}"#; + decoder.decode(json.as_bytes()).unwrap(); + decoder.finish().unwrap(); + + // Empty list (no commas needed) + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[]"#; + decoder.decode(json.as_bytes()).unwrap(); + decoder.finish().unwrap(); + } + + #[test] + fn test_reject_invalid_commas_in_objects() { + // Verify that the parser correctly rejects invalid JSON with extra commas in objects + + // Empty with comma - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{,}"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("expected '\"' or '}'"), "Error was: {}", err); + + // Leading comma - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{, "field": 10}"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("expected '\"' or '}'"), "Error was: {}", err); + + // Double comma between fields - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{"a": 1,, "b": 2}"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("expected '\"' or '}'"), "Error was: {}", err); + + // Multiple commas - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{"a": 1,,,, "b": 2}"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("expected '\"' or '}'"), "Error was: {}", err); + + // Trailing comma - intentionally allowed + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"{"a": 1,}"#; + decoder.decode(json.as_bytes()).unwrap(); + let tape = decoder.finish().unwrap(); + let mut s = String::new(); + tape.serialize(&mut s, 1); + assert!(s.contains("\"a\"")); + } + + #[test] + fn test_reject_invalid_commas_in_lists() { + // Verify that the parser correctly rejects invalid JSON with extra commas in lists + + // Empty with comma - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[,]"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("parsing value"), "Error was: {}", err); + + // Leading comma - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[, 1, 2]"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("parsing value"), "Error was: {}", err); + + // Double comma between elements - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[1,, 2, 3]"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("parsing value"), "Error was: {}", err); + + // Multiple commas - should reject + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[1,,,, 2]"#; + let err = decoder.decode(json.as_bytes()).unwrap_err().to_string(); + assert!(err.contains("parsing value"), "Error was: {}", err); + + // Trailing comma - intentionally allowed + let mut decoder = TapeDecoder::new(16, 2); + let json = r#"[1, 2,]"#; + decoder.decode(json.as_bytes()).unwrap(); + let tape = decoder.finish().unwrap(); + let mut s = String::new(); + tape.serialize(&mut s, 1); + assert!(s.contains("1")); + assert!(s.contains("2")); + } } From 83b99609aa1c6c119430f26fd26146a2a13c3956 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 16 Jan 2026 15:08:00 -0800 Subject: [PATCH 2/9] optimize whitespace --- arrow-json/src/reader/tape.rs | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index fc0612e8fcba..67a0816cd18e 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -367,8 +367,7 @@ impl TapeDecoder { let state = match self.stack.last_mut() { Some(l) => l, None => { - iter.skip_whitespace(); - if iter.is_empty() || self.cur_row >= self.batch_size { + if iter.skip_whitespace().is_none() || self.cur_row >= self.batch_size { break; } @@ -416,8 +415,7 @@ impl TapeDecoder { // Decoding a list - awaiting next element or ']' DecoderState::List(start_idx) => { let start_idx = *start_idx; - iter.skip_whitespace(); - match iter.peek() { + match iter.skip_whitespace() { Some(b']') => { iter.next(); self.write_end_list(start_idx); @@ -433,8 +431,7 @@ impl TapeDecoder { // Continue decoding a list - awaiting ',' or ']' DecoderState::ContinueList(start_idx) => { let start_idx = *start_idx; - iter.skip_whitespace(); - match iter.peek() { + match iter.skip_whitespace() { Some(b',') => { iter.next(); *state = DecoderState::List(start_idx); @@ -731,8 +728,24 @@ impl<'a> BufIter<'a> { } } - fn skip_whitespace(&mut self) { - self.advance_until(|b| !json_whitespace(b)); + // Skip to the next non-whitespace char and return a peek at it + fn skip_whitespace(&mut self) -> Option { + let s = self.as_slice(); + match s + .iter() + .copied() + .enumerate() + .find(|(_, b)| !json_whitespace(*b)) + { + Some((x, b)) => { + self.advance(x); + Some(b) + } + None => { + self.advance(s.len()); + None + } + } } } From 13145446e2a17c6def73e19c2fc4632e956f29d0 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 16 Jan 2026 15:10:34 -0800 Subject: [PATCH 3/9] optimize next --- arrow-json/src/reader/tape.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 67a0816cd18e..661982b2555b 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -753,9 +753,9 @@ impl Iterator for BufIter<'_> { type Item = u8; fn next(&mut self) -> Option { - let b = self.peek(); + let b = self.peek()?; self.pos += 1; - b + Some(b) } fn size_hint(&self) -> (usize, Option) { From dfd890d97dbb0e80a502b4c159721b4b964cd9dd Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 16 Jan 2026 15:44:09 -0800 Subject: [PATCH 4/9] optimize more whitespace --- arrow-json/src/reader/tape.rs | 44 ++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 661982b2555b..6ea38773e325 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -302,6 +302,16 @@ macro_rules! next { }; } +/// Evaluates to the next non-whitespace byte in the iterator or breaks the current loop +macro_rules! next_non_whitespace { + ($next:ident) => { + match $next.next_non_whitespace() { + Some(b) => b, + None => break, + } + }; +} + /// Implements a state machine for decoding JSON to a tape pub struct TapeDecoder { elements: Vec, @@ -382,8 +392,7 @@ impl TapeDecoder { // Decoding an object - awaiting next field or '}' DecoderState::Object(start_idx) => { let start_idx = *start_idx; - iter.skip_whitespace(); - match next!(iter) { + match next_non_whitespace!(iter) { b'"' => { *state = DecoderState::ContinueObject(start_idx); self.stack.push(DecoderState::Value); @@ -400,8 +409,7 @@ impl TapeDecoder { // Continue decoding an object - awaiting ',' or '}' DecoderState::ContinueObject(start_idx) => { let start_idx = *start_idx; - iter.skip_whitespace(); - match next!(iter) { + match next_non_whitespace!(iter) { b',' => { *state = DecoderState::Object(start_idx); } @@ -461,9 +469,8 @@ impl TapeDecoder { b => unreachable!("{}", b), } } - state @ DecoderState::Value => { - iter.skip_whitespace(); - *state = match next!(iter) { + DecoderState::Value => { + *state = match next_non_whitespace!(iter) { b'"' => DecoderState::String, b @ b'-' | b @ b'0'..=b'9' => { self.bytes.push(b); @@ -499,8 +506,7 @@ impl TapeDecoder { } } DecoderState::Colon => { - iter.skip_whitespace(); - match next!(iter) { + match next_non_whitespace!(iter) { b':' => self.stack.pop(), b => return Err(err(b, "parsing colon")), }; @@ -747,6 +753,26 @@ impl<'a> BufIter<'a> { } } } + + // Advance to and consume the next non-whitespace char + fn next_non_whitespace(&mut self) -> Option { + let s = self.as_slice(); + match s + .iter() + .copied() + .enumerate() + .find(|(_, b)| !json_whitespace(*b)) + { + Some((x, b)) => { + self.advance(x + 1); + Some(b) + } + None => { + self.advance(s.len()); + None + } + } + } } impl Iterator for BufIter<'_> { From 1bd8ff6d978b600a428e027ff6ca1f29f00d1c9a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 16 Jan 2026 15:59:42 -0800 Subject: [PATCH 5/9] actually optimize whitespace --- arrow-json/src/reader/tape.rs | 42 ++++++++++------------------------- 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 6ea38773e325..17f3bf7dd245 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -734,44 +734,26 @@ impl<'a> BufIter<'a> { } } - // Skip to the next non-whitespace char and return a peek at it + // Advance to the next non-whitespace char and peek at it fn skip_whitespace(&mut self) -> Option { - let s = self.as_slice(); - match s - .iter() - .copied() - .enumerate() - .find(|(_, b)| !json_whitespace(*b)) - { - Some((x, b)) => { - self.advance(x); - Some(b) - } - None => { - self.advance(s.len()); - None + for b in self.as_slice() { + if !json_whitespace(*b) { + return Some(*b); } + self.pos += 1; } + None } - // Advance to and consume the next non-whitespace char + // Advance to the next non-whitespace char and consume it fn next_non_whitespace(&mut self) -> Option { - let s = self.as_slice(); - match s - .iter() - .copied() - .enumerate() - .find(|(_, b)| !json_whitespace(*b)) - { - Some((x, b)) => { - self.advance(x + 1); - Some(b) - } - None => { - self.advance(s.len()); - None + for b in self.as_slice() { + self.pos += 1; + if !json_whitespace(*b) { + return Some(*b); } } + None } } From b37a11dcce50239972ca8816c5a4453e69dec0e1 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 20 Jan 2026 10:26:10 -0800 Subject: [PATCH 6/9] simplify a bit --- arrow-json/src/reader/tape.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 17f3bf7dd245..dbbed982368c 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -423,34 +423,30 @@ impl TapeDecoder { // Decoding a list - awaiting next element or ']' DecoderState::List(start_idx) => { let start_idx = *start_idx; - match iter.skip_whitespace() { - Some(b']') => { - iter.next(); + match next_non_whitespace!(iter) { + b']' => { self.write_end_list(start_idx); self.stack.pop(); } - Some(_) => { + _ => { + iter.pos -= 1; // rewind so ContinueList can consume the byte *state = DecoderState::ContinueList(start_idx); self.stack.push(DecoderState::Value); } - None => break, } } // Continue decoding a list - awaiting ',' or ']' DecoderState::ContinueList(start_idx) => { let start_idx = *start_idx; - match iter.skip_whitespace() { - Some(b',') => { - iter.next(); + match next_non_whitespace!(iter) { + b',' => { *state = DecoderState::List(start_idx); } - Some(b']') => { - iter.next(); + b']' => { self.write_end_list(start_idx); self.stack.pop(); } - Some(b) => return Err(err(b, "expected ',' or ']'")), - None => break, + b => return Err(err(b, "expected ',' or ']'")), } } // Decoding a string From 58c21e7359576668e7f1f5b3dc15e758f7bb5f03 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 20 Jan 2026 11:10:33 -0800 Subject: [PATCH 7/9] inline the old Value state --- arrow-json/src/reader/tape.rs | 114 ++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 32 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index dbbed982368c..4bdca1c2dcb4 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -219,20 +219,23 @@ enum DecoderState { /// Decoding an object - awaiting a '"' (new field) or '}' (done) /// /// Contains index of start [`TapeElement::StartObject`] + /// This state handles both the initial `{` and after `,` Object(u32), - /// Continue decoding an object - awaiting ',' (before next field) or '}' (done) + /// After a value in an object member - awaiting ',' (next field) or '}' (done) /// /// Contains index of start [`TapeElement::StartObject`] - ContinueObject(u32), - /// Decoding a list - awaiting first element or ']' (done) + ObjectAfterValue(u32), + /// Decoding a list - awaiting a value or ']' (done) /// /// Contains index of start [`TapeElement::StartList`] + /// This state handles both the initial `[` and after `,` List(u32), - /// Continue decoding a list - awaiting ',' (before next element) or ']' (done) + /// After a value in a list - awaiting ',' (next element) or ']' (done) /// /// Contains index of start [`TapeElement::StartList`] - ContinueList(u32), + ListAfterValue(u32), String, + /// Skip whitespace and detect value type Value, Number, Colon, @@ -250,8 +253,8 @@ enum DecoderState { impl DecoderState { fn as_str(&self) -> &'static str { match self { - DecoderState::Object(_) | DecoderState::ContinueObject(_) => "object", - DecoderState::List(_) | DecoderState::ContinueList(_) => "list", + DecoderState::Object(_) | DecoderState::ObjectAfterValue(_) => "object", + DecoderState::List(_) | DecoderState::ListAfterValue(_) => "list", DecoderState::String => "string", DecoderState::Value => "value", DecoderState::Number => "number", @@ -377,25 +380,52 @@ impl TapeDecoder { let state = match self.stack.last_mut() { Some(l) => l, None => { - if iter.skip_whitespace().is_none() || self.cur_row >= self.batch_size { + if self.cur_row >= self.batch_size { break; } + let b = match iter.next_non_whitespace() { + Some(b) => b, + None => break, + }; + // Start of row self.cur_row += 1; - self.stack.push(DecoderState::Value); + + // Detect value type and push appropriate state + match b { + b'"' => self.stack.push(DecoderState::String), + b @ (b'-' | b'0'..=b'9') => { + self.bytes.push(b); + self.stack.push(DecoderState::Number); + } + b'n' => self.stack.push(DecoderState::Literal(Literal::Null, 1)), + b'f' => self.stack.push(DecoderState::Literal(Literal::False, 1)), + b't' => self.stack.push(DecoderState::Literal(Literal::True, 1)), + b'[' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartList(u32::MAX)); + self.stack.push(DecoderState::List(idx)); + } + b'{' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartObject(u32::MAX)); + self.stack.push(DecoderState::Object(idx)); + } + b => return Err(err(b, "parsing value")), + } + self.stack.last_mut().unwrap() } }; match state { - // Decoding an object - awaiting next field or '}' + // Expecting object member or close brace DecoderState::Object(start_idx) => { let start_idx = *start_idx; match next_non_whitespace!(iter) { b'"' => { - *state = DecoderState::ContinueObject(start_idx); - self.stack.push(DecoderState::Value); + *state = DecoderState::ObjectAfterValue(start_idx); self.stack.push(DecoderState::Colon); self.stack.push(DecoderState::String); } @@ -406,8 +436,8 @@ impl TapeDecoder { b => return Err(err(b, "expected '\"' or '}'")), } } - // Continue decoding an object - awaiting ',' or '}' - DecoderState::ContinueObject(start_idx) => { + // After value in object - expecting comma or close brace + DecoderState::ObjectAfterValue(start_idx) => { let start_idx = *start_idx; match next_non_whitespace!(iter) { b',' => { @@ -428,15 +458,45 @@ impl TapeDecoder { self.write_end_list(start_idx); self.stack.pop(); } - _ => { - iter.pos -= 1; // rewind so ContinueList can consume the byte - *state = DecoderState::ContinueList(start_idx); - self.stack.push(DecoderState::Value); + // Inline value detection to avoid redundant WS check + b'"' => { + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::String); + } + b @ (b'-' | b'0'..=b'9') => { + self.bytes.push(b); + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::Number); } + b'n' => { + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::Literal(Literal::Null, 1)); + } + b'f' => { + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::Literal(Literal::False, 1)); + } + b't' => { + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::Literal(Literal::True, 1)); + } + b'[' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartList(u32::MAX)); + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::List(idx)); + } + b'{' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartObject(u32::MAX)); + *state = DecoderState::ListAfterValue(start_idx); + self.stack.push(DecoderState::Object(idx)); + } + b => return Err(err(b, "parsing value")), } } - // Continue decoding a list - awaiting ',' or ']' - DecoderState::ContinueList(start_idx) => { + // After value in a list - expecting comma or close bracket + DecoderState::ListAfterValue(start_idx) => { let start_idx = *start_idx; match next_non_whitespace!(iter) { b',' => { @@ -465,6 +525,7 @@ impl TapeDecoder { b => unreachable!("{}", b), } } + // Skip whitespace and detect value type DecoderState::Value => { *state = match next_non_whitespace!(iter) { b'"' => DecoderState::String, @@ -503,7 +564,7 @@ impl TapeDecoder { } DecoderState::Colon => { match next_non_whitespace!(iter) { - b':' => self.stack.pop(), + b':' => *state = DecoderState::Value, b => return Err(err(b, "parsing colon")), }; } @@ -730,17 +791,6 @@ impl<'a> BufIter<'a> { } } - // Advance to the next non-whitespace char and peek at it - fn skip_whitespace(&mut self) -> Option { - for b in self.as_slice() { - if !json_whitespace(*b) { - return Some(*b); - } - self.pos += 1; - } - None - } - // Advance to the next non-whitespace char and consume it fn next_non_whitespace(&mut self) -> Option { for b in self.as_slice() { From c9df425e87998503bda19525b30f9d9fb8f77a1d Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 20 Jan 2026 11:34:46 -0800 Subject: [PATCH 8/9] factor out a macro --- arrow-json/src/reader/tape.rs | 151 +++++++++++----------------------- 1 file changed, 50 insertions(+), 101 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 4bdca1c2dcb4..c2025c8a190c 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -315,6 +315,35 @@ macro_rules! next_non_whitespace { }; } +/// Dispatches value type detection with optional special case and custom transition function +macro_rules! dispatch_value { + ($self:ident, $b:expr, |$s:ident| $transition:expr $(, $special:pat => $special_body:expr)?) => {{ + let $s = match $b { + $($special => $special_body,)? + b'"' => DecoderState::String, + b @ (b'-' | b'0'..=b'9') => { + $self.bytes.push(b); + DecoderState::Number + } + b'n' => DecoderState::Literal(Literal::Null, 1), + b'f' => DecoderState::Literal(Literal::False, 1), + b't' => DecoderState::Literal(Literal::True, 1), + b'[' => { + let idx = $self.elements.len() as u32; + $self.elements.push(TapeElement::StartList(u32::MAX)); + DecoderState::List(idx) + } + b'{' => { + let idx = $self.elements.len() as u32; + $self.elements.push(TapeElement::StartObject(u32::MAX)); + DecoderState::Object(idx) + } + b => return Err(err(b, "parsing value")), + }; + $transition + }}; +} + /// Implements a state machine for decoding JSON to a tape pub struct TapeDecoder { elements: Vec, @@ -360,17 +389,19 @@ impl TapeDecoder { } /// Write the closing elements for an object to the tape - fn write_end_object(&mut self, start_idx: u32) { + fn end_object(&mut self, start_idx: u32) { let end_idx = self.elements.len() as u32; self.elements[start_idx as usize] = TapeElement::StartObject(end_idx); self.elements.push(TapeElement::EndObject(start_idx)); + self.stack.pop(); } /// Write the closing elements for a list to the tape - fn write_end_list(&mut self, start_idx: u32) { + fn end_list(&mut self, start_idx: u32) { let end_idx = self.elements.len() as u32; self.elements[start_idx as usize] = TapeElement::StartList(end_idx); self.elements.push(TapeElement::EndList(start_idx)); + self.stack.pop(); } pub fn decode(&mut self, buf: &[u8]) -> Result { @@ -393,27 +424,7 @@ impl TapeDecoder { self.cur_row += 1; // Detect value type and push appropriate state - match b { - b'"' => self.stack.push(DecoderState::String), - b @ (b'-' | b'0'..=b'9') => { - self.bytes.push(b); - self.stack.push(DecoderState::Number); - } - b'n' => self.stack.push(DecoderState::Literal(Literal::Null, 1)), - b'f' => self.stack.push(DecoderState::Literal(Literal::False, 1)), - b't' => self.stack.push(DecoderState::Literal(Literal::True, 1)), - b'[' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartList(u32::MAX)); - self.stack.push(DecoderState::List(idx)); - } - b'{' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartObject(u32::MAX)); - self.stack.push(DecoderState::Object(idx)); - } - b => return Err(err(b, "parsing value")), - } + dispatch_value!(self, b, |s| self.stack.push(s)); self.stack.last_mut().unwrap() } @@ -429,10 +440,7 @@ impl TapeDecoder { self.stack.push(DecoderState::Colon); self.stack.push(DecoderState::String); } - b'}' => { - self.write_end_object(start_idx); - self.stack.pop(); - } + b'}' => self.end_object(start_idx), b => return Err(err(b, "expected '\"' or '}'")), } } @@ -440,72 +448,33 @@ impl TapeDecoder { DecoderState::ObjectAfterValue(start_idx) => { let start_idx = *start_idx; match next_non_whitespace!(iter) { - b',' => { - *state = DecoderState::Object(start_idx); - } - b'}' => { - self.write_end_object(start_idx); - self.stack.pop(); - } + b',' => *state = DecoderState::Object(start_idx), + b'}' => self.end_object(start_idx), b => return Err(err(b, "expected ',' or '}'")), } } // Decoding a list - awaiting next element or ']' DecoderState::List(start_idx) => { let start_idx = *start_idx; - match next_non_whitespace!(iter) { - b']' => { - self.write_end_list(start_idx); - self.stack.pop(); - } - // Inline value detection to avoid redundant WS check - b'"' => { - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::String); - } - b @ (b'-' | b'0'..=b'9') => { - self.bytes.push(b); + dispatch_value!( + self, + next_non_whitespace!(iter), + |s| { *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::Number); - } - b'n' => { - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::Literal(Literal::Null, 1)); - } - b'f' => { - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::Literal(Literal::False, 1)); - } - b't' => { - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::Literal(Literal::True, 1)); - } - b'[' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartList(u32::MAX)); - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::List(idx)); - } - b'{' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartObject(u32::MAX)); - *state = DecoderState::ListAfterValue(start_idx); - self.stack.push(DecoderState::Object(idx)); + self.stack.push(s); + }, + b']' => { + self.end_list(start_idx); + continue; } - b => return Err(err(b, "parsing value")), - } + ); } // After value in a list - expecting comma or close bracket DecoderState::ListAfterValue(start_idx) => { let start_idx = *start_idx; match next_non_whitespace!(iter) { - b',' => { - *state = DecoderState::List(start_idx); - } - b']' => { - self.write_end_list(start_idx); - self.stack.pop(); - } + b',' => *state = DecoderState::List(start_idx), + b']' => self.end_list(start_idx), b => return Err(err(b, "expected ',' or ']'")), } } @@ -527,27 +496,7 @@ impl TapeDecoder { } // Skip whitespace and detect value type DecoderState::Value => { - *state = match next_non_whitespace!(iter) { - b'"' => DecoderState::String, - b @ b'-' | b @ b'0'..=b'9' => { - self.bytes.push(b); - DecoderState::Number - } - b'n' => DecoderState::Literal(Literal::Null, 1), - b'f' => DecoderState::Literal(Literal::False, 1), - b't' => DecoderState::Literal(Literal::True, 1), - b'[' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartList(u32::MAX)); - DecoderState::List(idx) - } - b'{' => { - let idx = self.elements.len() as u32; - self.elements.push(TapeElement::StartObject(u32::MAX)); - DecoderState::Object(idx) - } - b => return Err(err(b, "parsing value")), - }; + *state = dispatch_value!(self, next_non_whitespace!(iter), |s| s); } DecoderState::Number => { let s = iter.advance_until(|b| { From 2120082c7e36bf7403eab519dbe42fe105f7c1a3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 20 Jan 2026 14:28:12 -0800 Subject: [PATCH 9/9] cleanup --- arrow-json/src/reader/tape.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index c2025c8a190c..170115a12bc6 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -415,12 +415,8 @@ impl TapeDecoder { break; } - let b = match iter.next_non_whitespace() { - Some(b) => b, - None => break, - }; - // Start of row + let b = next_non_whitespace!(iter); self.cur_row += 1; // Detect value type and push appropriate state