Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,12 @@ private static class SerializationState {
private SerializationState(
VariantMetadata metadata,
VariantObject unshredded,
Map<String, VariantValue> shreddedFields,
Map<String, VariantValue> shredded,
Set<String> removedFields) {
this.metadata = metadata;
// field ID size is the size needed to store the largest field ID in the data
this.fieldIdSize = VariantUtil.sizeOf(metadata.dictionarySize());
this.shreddedFields = Maps.newHashMap(shreddedFields);
this.shreddedFields = Maps.newHashMap(shredded);

int totalDataSize = 0;
// get the unshredded field names and values as byte buffers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,28 @@ public void testPartiallyShreddedObjectSerializationMinimalBuffer() {
.isEqualTo(DateTimeUtil.isoDateToDays("2024-10-12"));
}

@Test
public void testPartiallyShreddedUnserializedObjectSerializationMinimalBuffer() {
ShreddedObject partial = createUnserializedObject(FIELDS);
VariantMetadata metadata = partial.metadata();

// replace field c with a new value
partial.put("c", Variants.ofIsoDate("2024-10-12"));
partial.remove("b");

VariantValue value = roundTripMinimalBuffer(partial, metadata);

assertThat(value).isInstanceOf(SerializedObject.class);
SerializedObject actual = (SerializedObject) value;

assertThat(actual.get("a")).isInstanceOf(VariantPrimitive.class);
assertThat(actual.get("a").asPrimitive().get()).isEqualTo(34);
assertThat(actual.get("c")).isInstanceOf(VariantPrimitive.class);
assertThat(actual.get("c").type()).isEqualTo(PhysicalType.DATE);
assertThat(actual.get("c").asPrimitive().get())
.isEqualTo(DateTimeUtil.isoDateToDays("2024-10-12"));
}

@Test
public void testPartiallyShreddedObjectSerializationLargeBuffer() {
ShreddedObject partial = createUnshreddedObject(FIELDS);
Expand Down Expand Up @@ -381,6 +403,12 @@ private static ShreddedObject createShreddedObject(
return object;
}

private static ShreddedObject createUnserializedObject(Map<String, VariantValue> fields) {
ByteBuffer metadataBuffer = VariantTestUtil.createMetadata(fields.keySet(), false);
VariantMetadata metadata = SerializedMetadata.from(metadataBuffer);
return new ShreddedObject(metadata, createShreddedObject(metadata, fields));
}

/** Creates a ShreddedObject with fields in its shredded map */
private static ShreddedObject createShreddedObject(Map<String, VariantValue> fields) {
ByteBuffer metadataBuffer = VariantTestUtil.createMetadata(fields.keySet(), false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.variants.ShreddedObject;
import org.apache.iceberg.variants.ValueArray;
import org.apache.iceberg.variants.Variant;
import org.apache.iceberg.variants.VariantArray;
Expand All @@ -52,6 +53,7 @@
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.FieldSource;

Expand Down Expand Up @@ -280,4 +282,53 @@ private static ValueArray array(VariantValue... values) {

return arr;
}

@Test
public void testPartialShreddingWithShreddedObject() throws IOException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there a more direct way to test this in TestShreddedObject? I think it's fine to have an end-to-end test, but I'd like to see a more direct one that doesn't go through Parquet.

// Test for issue #15086: partial shredding with ShreddedObject created using put()
// Create a ShreddedObject with multiple fields, then partially shred it
VariantMetadata metadata = Variants.metadata("id", "name", "city");

// Create objects using ShreddedObject.put() instead of serialized buffers
List<Record> records = Lists.newArrayList();
for (int i = 0; i < 3; i++) {
ShreddedObject obj = Variants.object(metadata);
obj.put("id", Variants.of(1000L + i));
obj.put("name", Variants.of("user_" + i));
obj.put("city", Variants.of("city_" + i));

Variant variant = Variant.of(metadata, obj);
Record record = RECORD.copy("id", i, "var", variant);
records.add(record);
}

// Shredding function that only shreds the "id" field
VariantShreddingFunction partialShredding =
(id, name) -> {
VariantMetadata shreddedMetadata = Variants.metadata("id");
ShreddedObject shreddedObject = Variants.object(shreddedMetadata);
shreddedObject.put("id", Variants.of(1234L));
return ParquetVariantUtil.toParquetSchema(shreddedObject);
};

// Write and read back
List<Record> actual = writeAndRead(partialShredding, records);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think an end-to-end test should live in TestVariantReaders rather than TestVariantWriters. The problem you're trying to recreate is in the read path when a partially shredded object is being reconstructed. I think that the tests should also be in the read path.

Copy link
Contributor

@rdblue rdblue Jan 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked into this a bit more and ended up producing a test for TestShreddedObject that reproduces the problem. Here's my full diff of the test and the fix:

diff --git a/core/src/main/java/org/apache/iceberg/variants/ShreddedObject.java b/core/src/main/java/org/apache/iceberg/variants/ShreddedObject.java
index c1fba073cf..5d2bfc48fa 100644
--- a/core/src/main/java/org/apache/iceberg/variants/ShreddedObject.java
+++ b/core/src/main/java/org/apache/iceberg/variants/ShreddedObject.java
@@ -153,12 +153,12 @@ public class ShreddedObject implements VariantObject {
     private SerializationState(
         VariantMetadata metadata,
         VariantObject unshredded,
-        Map<String, VariantValue> shreddedFields,
+        Map<String, VariantValue> shredded,
         Set<String> removedFields) {
       this.metadata = metadata;
       // field ID size is the size needed to store the largest field ID in the data
       this.fieldIdSize = VariantUtil.sizeOf(metadata.dictionarySize());
-      this.shreddedFields = Maps.newHashMap(shreddedFields);
+      this.shreddedFields = Maps.newHashMap(shredded);
 
       int totalDataSize = 0;
       // get the unshredded field names and values as byte buffers
diff --git a/core/src/test/java/org/apache/iceberg/variants/TestShreddedObject.java b/core/src/test/java/org/apache/iceberg/variants/TestShreddedObject.java
index 1ebb433a54..0cb043d420 100644
--- a/core/src/test/java/org/apache/iceberg/variants/TestShreddedObject.java
+++ b/core/src/test/java/org/apache/iceberg/variants/TestShreddedObject.java
@@ -213,6 +213,28 @@ public class TestShreddedObject {
         .isEqualTo(DateTimeUtil.isoDateToDays("2024-10-12"));
   }
 
+  @Test
+  public void testPartiallyShreddedUnserializedObjectSerializationMinimalBuffer() {
+    ShreddedObject partial = createUnserializedObject(FIELDS);
+    VariantMetadata metadata = partial.metadata();
+
+    // replace field c with a new value
+    partial.put("c", Variants.ofIsoDate("2024-10-12"));
+    partial.remove("b");
+
+    VariantValue value = roundTripMinimalBuffer(partial, metadata);
+
+    assertThat(value).isInstanceOf(SerializedObject.class);
+    SerializedObject actual = (SerializedObject) value;
+
+    assertThat(actual.get("a")).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get("a").asPrimitive().get()).isEqualTo(34);
+    assertThat(actual.get("c")).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get("c").type()).isEqualTo(PhysicalType.DATE);
+    assertThat(actual.get("c").asPrimitive().get())
+        .isEqualTo(DateTimeUtil.isoDateToDays("2024-10-12"));
+  }
+
   @Test
   public void testPartiallyShreddedObjectSerializationLargeBuffer() {
     ShreddedObject partial = createUnshreddedObject(FIELDS);
@@ -370,6 +392,12 @@ public class TestShreddedObject {
     return Variants.value(metadata, slice);
   }
 
+  private static ShreddedObject createUnserializedObject(Map<String, VariantValue> fields) {
+    ByteBuffer metadataBuffer = VariantTestUtil.createMetadata(fields.keySet(), false);
+    VariantMetadata metadata = SerializedMetadata.from(metadataBuffer);
+    return new ShreddedObject(metadata, createShreddedObject(metadata, fields));
+  }
+
   /** Creates a ShreddedObject with fields in its shredded map, using the given metadata */
   private static ShreddedObject createShreddedObject(
       VariantMetadata metadata, Map<String, VariantValue> fields) {

Feel free to use that test case. It would also be nice to have a Parquet test like this one. If you end up keeping it, please use helper methods to create the Parquet schema (see ParquetVariantUtil).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I'll include your test case, and I'd also like to keep my test case.

I changed the code from constructing the Parquet schema manually to using ParquetVariantUtil to infer the Parquet schema.


// Verify all records match
assertThat(actual).hasSameSizeAs(records);
for (int i = 0; i < records.size(); i++) {
Record expected = records.get(i);
Record read = actual.get(i);

InternalTestHelpers.assertEquals(SCHEMA.asStruct(), expected, read);

// Also verify the variant object has all fields intact
Variant readVariant = (Variant) read.getField("var");
VariantObject readObj = readVariant.value().asObject();
assertThat(readObj.numFields()).isEqualTo(3);
assertThat(readObj.get("id").asPrimitive().get()).isEqualTo(1000L + i);
assertThat(readObj.get("name").asPrimitive().get()).isEqualTo("user_" + i);
assertThat(readObj.get("city").asPrimitive().get()).isEqualTo("city_" + i);
}
}
}