diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index 4ba3d8250d35..0cf335fe17ab 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -404,6 +404,114 @@ acceptedBreaks: old: "method org.apache.iceberg.orc.ORC.WriteBuilder org.apache.iceberg.orc.ORC.WriteBuilder::config(java.lang.String,\ \ java.lang.String)" justification: "Removing deprecations for 1.2.0" + "1.10.0": + org.apache.iceberg:iceberg-api: + - code: "java.class.defaultSerializationChanged" + old: "class org.apache.iceberg.encryption.EncryptingFileIO" + new: "class org.apache.iceberg.encryption.EncryptingFileIO" + justification: "New method for Manifest List reading" + - code: "java.method.addedToInterface" + new: "method org.apache.iceberg.UpdateSchema org.apache.iceberg.UpdateSchema::undeleteColumn(java.lang.String)" + justification: "Backwards compatible - only adds new spark functions" + org.apache.iceberg:iceberg-core: + - code: "java.class.noLongerInheritsFromClass" + old: "class org.apache.iceberg.rest.auth.OAuth2Manager" + new: "class org.apache.iceberg.rest.auth.OAuth2Manager" + justification: "Removing deprecations for 1.11.0" + - code: "java.class.nowImplementsInterface" + old: "class org.apache.iceberg.rest.auth.OAuth2Manager" + new: "class org.apache.iceberg.rest.auth.OAuth2Manager" + justification: "Removing deprecations for 1.11.0" + - code: "java.class.removed" + old: "class org.apache.iceberg.PartitionStatsUtil" + justification: "Removing deprecated code for 1.11.0" + - code: "java.class.removed" + old: "class org.apache.iceberg.rest.auth.RefreshingAuthManager" + justification: "Removing deprecations for 1.11.0" + - code: "java.field.constantValueChanged" + old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN" + new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN" + justification: "Plan API is table scoped and path constant value should include\ + \ namespace. 
No actual breakage because it never worked before with incorrect\ + \ value." + - code: "java.field.constantValueChanged" + old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_SUBMIT" + new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_SUBMIT" + justification: "Plan API is table scoped and path constant value should include\ + \ namespace. No actual breakage because it never worked before with incorrect\ + \ value." + - code: "java.field.constantValueChanged" + old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_TASKS" + new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_TASKS" + justification: "Plan API is table scoped and path constant value should include\ + \ namespace. No actual breakage because it never worked before with incorrect\ + \ value." + - code: "java.method.removed" + old: "method java.lang.String org.apache.iceberg.RewriteTablePathUtil::stagingPath(java.lang.String,\ + \ java.lang.String)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.RewriteTablePathUtil.RewriteResult\ + \ org.apache.iceberg.RewriteTablePathUtil::rewriteDataManifest(org.apache.iceberg.ManifestFile,\ + \ org.apache.iceberg.io.OutputFile, org.apache.iceberg.io.FileIO, int, java.util.Map, java.lang.String, java.lang.String) throws\ + \ java.io.IOException" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.RewriteTablePathUtil.RewriteResult\ + \ org.apache.iceberg.RewriteTablePathUtil::rewriteDeleteManifest(org.apache.iceberg.ManifestFile,\ + \ org.apache.iceberg.io.OutputFile, org.apache.iceberg.io.FileIO, int, java.util.Map, java.lang.String, java.lang.String, java.lang.String)\ + \ throws java.io.IOException" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.Schema 
org.apache.iceberg.PartitionStatsHandler::schema(org.apache.iceberg.types.Types.StructType)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.TableMetadata org.apache.iceberg.TableMetadataParser::read(org.apache.iceberg.io.FileIO,\ + \ org.apache.iceberg.io.InputFile)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.encryption.EncryptionManager org.apache.iceberg.encryption.EncryptionUtil::createEncryptionManager(java.util.Map, org.apache.iceberg.encryption.KeyManagementClient)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::exchangeToken(org.apache.iceberg.rest.RESTClient,\ + \ java.util.Map, java.lang.String, java.lang.String,\ + \ java.lang.String, java.lang.String, java.lang.String)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::exchangeToken(org.apache.iceberg.rest.RESTClient,\ + \ java.util.Map, java.lang.String, java.lang.String,\ + \ java.lang.String, java.lang.String, java.lang.String, java.lang.String)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::fetchToken(org.apache.iceberg.rest.RESTClient,\ + \ java.util.Map, java.lang.String, java.lang.String)" + justification: "Removing deprecated code for 1.11.0" + - code: "java.method.removed" + old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::fetchToken(org.apache.iceberg.rest.RESTClient,\ + \ java.util.Map, java.lang.String, java.lang.String,\ + \ java.lang.String)" + 
justification: "Removing deprecated code for 1.11.0" + - code: "java.method.visibilityReduced" + old: "method void org.apache.iceberg.PartitionStats::appendStats(org.apache.iceberg.PartitionStats)" + new: "method void org.apache.iceberg.PartitionStats::appendStats(org.apache.iceberg.PartitionStats)" + justification: "Changing deprecated code" + - code: "java.method.visibilityReduced" + old: "method void org.apache.iceberg.PartitionStats::deletedEntry(org.apache.iceberg.Snapshot)" + new: "method void org.apache.iceberg.PartitionStats::deletedEntry(org.apache.iceberg.Snapshot)" + justification: "Changing deprecated code" + - code: "java.method.visibilityReduced" + old: "method void org.apache.iceberg.PartitionStats::liveEntry(org.apache.iceberg.ContentFile,\ + \ org.apache.iceberg.Snapshot)" + new: "method void org.apache.iceberg.PartitionStats::liveEntry(org.apache.iceberg.ContentFile,\ + \ org.apache.iceberg.Snapshot)" + justification: "Changing deprecated code" + org.apache.iceberg:iceberg-data: + - code: "java.class.removed" + old: "class org.apache.iceberg.data.PartitionStatsHandler" + justification: "Removing deprecated code for 1.11.0" "1.2.0": org.apache.iceberg:iceberg-api: - code: "java.field.constantValueChanged" @@ -1363,103 +1471,6 @@ acceptedBreaks: old: "method org.apache.iceberg.parquet.ParquetValueWriters.StructWriter\ \ org.apache.iceberg.data.parquet.GenericParquetWriter::createStructWriter(java.util.List>)" justification: "Removing deprecations for 1.10.0" - "1.10.0": - org.apache.iceberg:iceberg-api: - - code: "java.class.defaultSerializationChanged" - old: "class org.apache.iceberg.encryption.EncryptingFileIO" - new: "class org.apache.iceberg.encryption.EncryptingFileIO" - justification: "New method for Manifest List reading" - org.apache.iceberg:iceberg-core: - - code: "java.class.noLongerInheritsFromClass" - old: "class org.apache.iceberg.rest.auth.OAuth2Manager" - new: "class org.apache.iceberg.rest.auth.OAuth2Manager" - justification: 
"Removing deprecations for 1.11.0" - - code: "java.class.nowImplementsInterface" - old: "class org.apache.iceberg.rest.auth.OAuth2Manager" - new: "class org.apache.iceberg.rest.auth.OAuth2Manager" - justification: "Removing deprecations for 1.11.0" - - code: "java.class.removed" - old: "class org.apache.iceberg.rest.auth.RefreshingAuthManager" - justification: "Removing deprecations for 1.11.0" - - code: "java.field.constantValueChanged" - old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN" - new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN" - justification: "Plan API is table scoped and path constant value should include namespace. No actual breakage because it never worked before with incorrect value." - - code: "java.field.constantValueChanged" - old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_SUBMIT" - new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_SUBMIT" - justification: "Plan API is table scoped and path constant value should include namespace. No actual breakage because it never worked before with incorrect value." - - code: "java.field.constantValueChanged" - old: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_TASKS" - new: "field org.apache.iceberg.rest.ResourcePaths.V1_TABLE_SCAN_PLAN_TASKS" - justification: "Plan API is table scoped and path constant value should include namespace. No actual breakage because it never worked before with incorrect value." 
- - code: "java.class.removed" - old: "class org.apache.iceberg.PartitionStatsUtil" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method java.lang.String org.apache.iceberg.RewriteTablePathUtil::stagingPath(java.lang.String,\ - \ java.lang.String)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.RewriteTablePathUtil.RewriteResult\ - \ org.apache.iceberg.RewriteTablePathUtil::rewriteDataManifest(org.apache.iceberg.ManifestFile,\ - \ org.apache.iceberg.io.OutputFile, org.apache.iceberg.io.FileIO, int, java.util.Map, java.lang.String, java.lang.String) throws\ - \ java.io.IOException" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.RewriteTablePathUtil.RewriteResult\ - \ org.apache.iceberg.RewriteTablePathUtil::rewriteDeleteManifest(org.apache.iceberg.ManifestFile,\ - \ org.apache.iceberg.io.OutputFile, org.apache.iceberg.io.FileIO, int, java.util.Map, java.lang.String, java.lang.String, java.lang.String)\ - \ throws java.io.IOException" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.Schema org.apache.iceberg.PartitionStatsHandler::schema(org.apache.iceberg.types.Types.StructType)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.TableMetadata org.apache.iceberg.TableMetadataParser::read(org.apache.iceberg.io.FileIO,\ - \ org.apache.iceberg.io.InputFile)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.encryption.EncryptionManager org.apache.iceberg.encryption.EncryptionUtil::createEncryptionManager(java.util.Map, org.apache.iceberg.encryption.KeyManagementClient)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: 
"method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::exchangeToken(org.apache.iceberg.rest.RESTClient,\ - \ java.util.Map, java.lang.String, java.lang.String,\ - \ java.lang.String, java.lang.String, java.lang.String)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::exchangeToken(org.apache.iceberg.rest.RESTClient,\ - \ java.util.Map, java.lang.String, java.lang.String,\ - \ java.lang.String, java.lang.String, java.lang.String, java.lang.String)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::fetchToken(org.apache.iceberg.rest.RESTClient,\ - \ java.util.Map, java.lang.String, java.lang.String)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.removed" - old: "method org.apache.iceberg.rest.responses.OAuthTokenResponse org.apache.iceberg.rest.auth.OAuth2Util::fetchToken(org.apache.iceberg.rest.RESTClient,\ - \ java.util.Map, java.lang.String, java.lang.String,\ - \ java.lang.String)" - justification: "Removing deprecated code for 1.11.0" - - code: "java.method.visibilityReduced" - old: "method void org.apache.iceberg.PartitionStats::liveEntry(org.apache.iceberg.ContentFile, org.apache.iceberg.Snapshot)" - new: "method void org.apache.iceberg.PartitionStats::liveEntry(org.apache.iceberg.ContentFile, org.apache.iceberg.Snapshot)" - justification: "Changing deprecated code" - - code: "java.method.visibilityReduced" - old: "method void org.apache.iceberg.PartitionStats::appendStats(org.apache.iceberg.PartitionStats)" - new: "method void org.apache.iceberg.PartitionStats::appendStats(org.apache.iceberg.PartitionStats)" - justification: "Changing deprecated code" - - code: "java.method.visibilityReduced" 
- old: "method void org.apache.iceberg.PartitionStats::deletedEntry(org.apache.iceberg.Snapshot)" - new: "method void org.apache.iceberg.PartitionStats::deletedEntry(org.apache.iceberg.Snapshot)" - justification: "Changing deprecated code" - org.apache.iceberg:iceberg-data: - - code: "java.class.removed" - old: "class org.apache.iceberg.data.PartitionStatsHandler" - justification: "Removing deprecated code for 1.11.0" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged" diff --git a/api/src/main/java/org/apache/iceberg/UpdateSchema.java b/api/src/main/java/org/apache/iceberg/UpdateSchema.java index cbcaa0ee2365..1401b7d1a7cf 100644 --- a/api/src/main/java/org/apache/iceberg/UpdateSchema.java +++ b/api/src/main/java/org/apache/iceberg/UpdateSchema.java @@ -564,6 +564,20 @@ default UpdateSchema updateColumnDefault(String name, Literal newDefault) { */ UpdateSchema deleteColumn(String name); + /** + * Restore a previously deleted column from the schema history. + * + *

The name is used to search for the column in historical schemas. The column is restored with + * its original field ID, preserving data file compatibility. Restored columns are always + * optional. + * + * @param name name of the column to restore (supports dot notation for nested fields) + * @return this for method chaining + * @throws IllegalArgumentException if name already exists, was never deleted, or parent struct + * does not exist + */ + UpdateSchema undeleteColumn(String name); + /** * Move a column from its current position to the start of the schema or its parent struct. * diff --git a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java index e42df2fe5ed3..b3a5b7348b3f 100644 --- a/core/src/main/java/org/apache/iceberg/SchemaUpdate.java +++ b/core/src/main/java/org/apache/iceberg/SchemaUpdate.java @@ -200,6 +200,109 @@ public UpdateSchema deleteColumn(String name) { return this; } + @Override + public UpdateSchema undeleteColumn(String name) { + Types.NestedField existingField = findField(name); + Preconditions.checkArgument( + existingField == null, + "Cannot undelete column '%s': a column with this name already exists in the current schema", + name); + + Preconditions.checkArgument( + base != null, + "Cannot undelete column: table metadata is required to access historical schemas"); + + DeletedColumnInfo deletedInfo = findDeletedColumn(name); + Preconditions.checkArgument( + deletedInfo != null, + "Cannot undelete column '%s': column not found in any historical schema", + name); + + int parentId = deletedInfo.parentId(); + Types.NestedField originalField = deletedInfo.field(); + + // undeleted columns are always optional since new data may not have values + Types.NestedField field = + Types.NestedField.optional( + originalField.fieldId(), + originalField.name(), + originalField.type(), + originalField.doc()); + + if (parentId != TABLE_ROOT_ID) { + idToParent.put(field.fieldId(), parentId); + 
} + + updates.put(field.fieldId(), field); + parentToAddedIds.put(parentId, field.fieldId()); + addedNameToId.put(name, field.fieldId()); + + return this; + } + + private record DeletedColumnInfo(int parentId, Types.NestedField field) {} + + /** Find the first instance of the deleted column, from most recent to oldest. */ + private DeletedColumnInfo findDeletedColumn(String name) { + List schemas = base.schemas(); + + String[] parts = name.split("\\."); + String parentPath = + parts.length > 1 + ? String.join(".", java.util.Arrays.copyOf(parts, parts.length - 1)) + : null; + + if (parentPath != null) { + Types.NestedField currentParent = findField(parentPath); + Preconditions.checkArgument( + currentParent != null, + "Cannot undelete nested column '%s': parent struct '%s' does not exist in current schema. " + + "Undelete the parent first.", + name, + parentPath); + } + + for (int i = schemas.size() - 1; i >= 0; i--) { + Schema historicalSchema = schemas.get(i); + + Types.NestedField field = + caseSensitive + ? historicalSchema.findField(name) + : historicalSchema.caseInsensitiveFindField(name); + + if (field != null) { + int parentId; + if (parentPath != null) { + Types.NestedField parentField = + caseSensitive + ? 
historicalSchema.findField(parentPath) + : historicalSchema.caseInsensitiveFindField(parentPath); + + if (parentField == null) { + continue; + } + + Type parentType = parentField.type(); + if (parentType.isNestedType()) { + Type.NestedType nested = parentType.asNestedType(); + if (nested.isMapType()) { + parentField = nested.asMapType().fields().get(1); + } else if (nested.isListType()) { + parentField = nested.asListType().fields().get(0); + } + } + parentId = parentField.fieldId(); + } else { + parentId = TABLE_ROOT_ID; + } + + return new DeletedColumnInfo(parentId, field); + } + } + + return null; + } + @Override public UpdateSchema renameColumn(String name, String newName) { Types.NestedField field = findField(name); @@ -561,7 +664,6 @@ private static Schema applyChanges( } } - // apply schema changes Types.StructType struct = TypeUtil.visit(schema, new ApplyChanges(deletes, updates, parentToAddedIds, moves)) .asNestedType() diff --git a/core/src/test/java/org/apache/iceberg/hadoop/TestSchemaUndelete.java b/core/src/test/java/org/apache/iceberg/hadoop/TestSchemaUndelete.java new file mode 100644 index 000000000000..c33ddb52fc8a --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/hadoop/TestSchemaUndelete.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.hadoop; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestSchemaUndelete extends HadoopTableTestBase { + + @Test + public void testUndeleteTopLevelColumn() { + // Add a column, then delete it, then undelete it + table.updateSchema().addColumn("count", Types.LongType.get(), "a count column").commit(); + + int originalFieldId = table.schema().findField("count").fieldId(); + assertThat(table.schema().findField("count")).isNotNull(); + + // Delete the column + table.updateSchema().deleteColumn("count").commit(); + assertThat(table.schema().findField("count")).isNull(); + + // Undelete the column + table.updateSchema().undeleteColumn("count").commit(); + + Types.NestedField restoredField = table.schema().findField("count"); + assertThat(restoredField).isNotNull(); + assertThat(restoredField.fieldId()).isEqualTo(originalFieldId); + assertThat(restoredField.type()).isEqualTo(Types.LongType.get()); + assertThat(restoredField.doc()).isEqualTo("a count column"); + } + + @Test + public void testUndeleteNestedField() { + // Add a struct with nested fields + table + .updateSchema() + .addColumn( + "location", + Types.StructType.of( + Types.NestedField.optional(100, "lat", Types.DoubleType.get()), + Types.NestedField.optional(101, "long", Types.DoubleType.get()))) + .commit(); + + int latFieldId = table.schema().findField("location.lat").fieldId(); + assertThat(table.schema().findField("location.lat")).isNotNull(); + + // Delete the nested field + table.updateSchema().deleteColumn("location.lat").commit(); + assertThat(table.schema().findField("location.lat")).isNull(); + + // Undelete the nested field + table.updateSchema().undeleteColumn("location.lat").commit(); + + 
Types.NestedField restoredField = table.schema().findField("location.lat"); + assertThat(restoredField).isNotNull(); + assertThat(restoredField.fieldId()).isEqualTo(latFieldId); + assertThat(restoredField.type()).isEqualTo(Types.DoubleType.get()); + } + + @Test + public void testUndeleteColumnAlreadyExists() { + // Try to undelete a column that already exists (id is part of SCHEMA) + assertThatThrownBy(() -> table.updateSchema().undeleteColumn("id").commit()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("already exists in the current schema"); + } + + @Test + public void testUndeleteColumnNotFound() { + // Try to undelete a column that was never in the schema + assertThatThrownBy(() -> table.updateSchema().undeleteColumn("nonexistent_column").commit()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("not found in any historical schema"); + } + + @Test + public void testUndeletePreservesFieldId() { + // This test explicitly verifies that the field ID is preserved (not a new ID) + table.updateSchema().addColumn("temp_col", Types.StringType.get()).commit(); + + int originalId = table.schema().findField("temp_col").fieldId(); + + // Add another column to increment the lastColumnId + table.updateSchema().addColumn("another_col", Types.IntegerType.get()).commit(); + int lastIdAfterAdd = table.schema().findField("another_col").fieldId(); + + // Delete temp_col + table.updateSchema().deleteColumn("temp_col").commit(); + + // Undelete temp_col + table.updateSchema().undeleteColumn("temp_col").commit(); + + Types.NestedField restored = table.schema().findField("temp_col"); + assertThat(restored.fieldId()) + .as("Restored field should have original ID, not a new one") + .isEqualTo(originalId); + assertThat(restored.fieldId()) + .as("Restored field ID should be less than the last assigned ID") + .isLessThan(lastIdAfterAdd); + } + + @Test + public void testUndeleteNestedFieldParentMissing() { + // Add a struct, delete the whole 
struct, then try to undelete a nested field + table + .updateSchema() + .addColumn( + "prefs", + Types.StructType.of( + Types.NestedField.optional(200, "setting1", Types.BooleanType.get()), + Types.NestedField.optional(201, "setting2", Types.BooleanType.get()))) + .commit(); + + // Delete the entire parent struct + table.updateSchema().deleteColumn("prefs").commit(); + assertThat(table.schema().findField("prefs")).isNull(); + + // Try to undelete nested field when parent doesn't exist + assertThatThrownBy(() -> table.updateSchema().undeleteColumn("prefs.setting1").commit()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("parent struct") + .hasMessageContaining("does not exist") + .hasMessageContaining("Undelete the parent first"); + } + + @Test + public void testUndeleteParentThenNestedField() { + // Add a struct, delete the whole struct, then undelete parent, then undelete nested field + table + .updateSchema() + .addColumn( + "config", + Types.StructType.of( + Types.NestedField.optional(300, "enabled", Types.BooleanType.get()), + Types.NestedField.optional(301, "value", Types.StringType.get()))) + .commit(); + + int enabledId = table.schema().findField("config.enabled").fieldId(); + int configId = table.schema().findField("config").fieldId(); + + // Delete both nested fields to empty the struct, then delete the struct + table.updateSchema().deleteColumn("config.enabled").deleteColumn("config.value").commit(); + table.updateSchema().deleteColumn("config").commit(); + + assertThat(table.schema().findField("config")).isNull(); + + // Undelete the parent struct first + table.updateSchema().undeleteColumn("config").commit(); + + Types.NestedField restoredConfig = table.schema().findField("config"); + assertThat(restoredConfig).isNotNull(); + assertThat(restoredConfig.fieldId()).isEqualTo(configId); + + // Now undelete the nested field + table.updateSchema().undeleteColumn("config.enabled").commit(); + + Types.NestedField restoredEnabled = 
table.schema().findField("config.enabled"); + assertThat(restoredEnabled).isNotNull(); + assertThat(restoredEnabled.fieldId()).isEqualTo(enabledId); + } + + @Test + public void testUndeleteRequiredColumnBecomesOptional() { + // Add a required column, delete it, then undelete it + // The undeleted column should be optional because new data may have been written + // without this column after it was deleted + table + .updateSchema() + .allowIncompatibleChanges() + .addRequiredColumn("required_col", Types.StringType.get()) + .commit(); + + Types.NestedField originalField = table.schema().findField("required_col"); + assertThat(originalField.isRequired()).isTrue(); + int originalFieldId = originalField.fieldId(); + + // Delete the required column + table.updateSchema().deleteColumn("required_col").commit(); + assertThat(table.schema().findField("required_col")).isNull(); + + // Undelete the column - it should now be optional + table.updateSchema().undeleteColumn("required_col").commit(); + + Types.NestedField restoredField = table.schema().findField("required_col"); + assertThat(restoredField).isNotNull(); + assertThat(restoredField.fieldId()).isEqualTo(originalFieldId); + assertThat(restoredField.isOptional()) + .as( + "Undeleted column must be optional (not required) because new data may have been " + + "written without this column") + .isTrue(); + } + + @Test + public void testUndeleteCaseInsensitive() { + // Add and delete a column + table.updateSchema().addColumn("MixedCase", Types.StringType.get()).commit(); + int originalId = table.schema().findField("MixedCase").fieldId(); + table.updateSchema().deleteColumn("MixedCase").commit(); + + // Undelete with different case (case insensitive mode) + table.updateSchema().caseSensitive(false).undeleteColumn("mixedcase").commit(); + + Types.NestedField restored = table.schema().findField("MixedCase"); + assertThat(restored).isNotNull(); + assertThat(restored.fieldId()).isEqualTo(originalId); + } + + @Test + public void 
testUndeleteRestoresMostRecentlyDeletedField() { + // Add a column, delete it, add it again (new ID), delete it again + // Undelete should restore the most recently deleted field (second one) + table.updateSchema().addColumn("reused_name", Types.StringType.get()).commit(); + int firstFieldId = table.schema().findField("reused_name").fieldId(); + + // Delete the first field + table.updateSchema().deleteColumn("reused_name").commit(); + assertThat(table.schema().findField("reused_name")).isNull(); + + // Add a new field with the same name (will get a new ID) + table.updateSchema().addColumn("reused_name", Types.IntegerType.get()).commit(); + int secondFieldId = table.schema().findField("reused_name").fieldId(); + assertThat(secondFieldId) + .as("Second field should have a different ID than the first") + .isNotEqualTo(firstFieldId); + + // Delete the second field + table.updateSchema().deleteColumn("reused_name").commit(); + assertThat(table.schema().findField("reused_name")).isNull(); + + // Undelete - should restore the most recently deleted field (the second one) + table.updateSchema().undeleteColumn("reused_name").commit(); + + Types.NestedField restored = table.schema().findField("reused_name"); + assertThat(restored).isNotNull(); + assertThat(restored.fieldId()) + .as("Undelete should restore the most recently deleted field, not the first") + .isEqualTo(secondFieldId); + assertThat(restored.type()) + .as("Restored field should have the type of the second field") + .isEqualTo(Types.IntegerType.get()); + } +} diff --git a/docs/docs/spark-procedures.md b/docs/docs/spark-procedures.md index 7f211d9f260b..c34dbcb1dc96 100644 --- a/docs/docs/spark-procedures.md +++ b/docs/docs/spark-procedures.md @@ -245,6 +245,52 @@ Fast-forward the main branch to the head of `audit-branch` CALL catalog_name.system.fast_forward('my_table', 'main', 'audit-branch'); ``` +## Schema management + +### `undelete_column` + +Restores a previously deleted column from the table's schema 
history. + +The column is restored with its original field ID, preserving data file compatibility. This allows you to recover columns that were accidentally deleted without losing access to existing data. + +!!! info + Restored columns are always made optional, even if the original column was required. This is because new data may have been written without this column after it was deleted. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `column` | ✔️ | string | Name of the column to restore (use dotted notation for nested fields, e.g., `struct.field`) | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `column_name` | string | The name of the restored column | +| `field_id` | int | The field ID of the restored column | +| `type` | string | The type of the restored column | + +#### Examples + +Restore a deleted top-level column `count` in table `db.sample`: +```sql +CALL catalog_name.system.undelete_column('db.sample', 'count'); +``` + +Restore a deleted nested field `location.lat` in table `db.sample`: +```sql +CALL catalog_name.system.undelete_column('db.sample', 'location.lat'); +``` + +Restore a column using named arguments: +```sql +CALL catalog_name.system.undelete_column(table => 'db.sample', column => 'deleted_col'); +``` + +!!! warning + If you want to undelete a nested field whose parent struct was also deleted, you must first undelete the parent struct, then undelete the nested field. + ## Metadata management Many [maintenance actions](maintenance.md) can be performed using Iceberg stored procedures. 
diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index 82f44996c8e1..af7bad32569b 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -64,6 +64,7 @@ private static Map> initProcedureBuilders() { mapBuilder.put("compute_table_stats", ComputeTableStatsProcedure::builder); mapBuilder.put("compute_partition_stats", ComputePartitionStatsProcedure::builder); mapBuilder.put("rewrite_table_path", RewriteTablePathProcedure::builder); + mapBuilder.put("undelete_column", UndeleteColumnProcedure::builder); return mapBuilder.build(); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java new file mode 100644 index 000000000000..f0b52f002242 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.procedures; + +import org.apache.iceberg.spark.procedures.SparkProcedures.ProcedureBuilder; +import org.apache.iceberg.types.Types; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.connector.catalog.Identifier; +import org.apache.spark.sql.connector.catalog.TableCatalog; +import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A procedure that restores a previously deleted column from the schema history. + * + *

The column is restored with its original field ID, preserving data file compatibility. + * + * @see org.apache.iceberg.UpdateSchema#undeleteColumn(String) + */ +class UndeleteColumnProcedure extends BaseProcedure { + + private static final ProcedureParameter[] PARAMETERS = + new ProcedureParameter[] { + ProcedureParameter.required("table", DataTypes.StringType), + ProcedureParameter.required("column", DataTypes.StringType) + }; + + private static final StructType OUTPUT_TYPE = + new StructType( + new StructField[] { + new StructField("column_name", DataTypes.StringType, false, Metadata.empty()), + new StructField("field_id", DataTypes.IntegerType, false, Metadata.empty()), + new StructField("type", DataTypes.StringType, false, Metadata.empty()) + }); + + public static ProcedureBuilder builder() { + return new BaseProcedure.Builder() { + @Override + protected UndeleteColumnProcedure doBuild() { + return new UndeleteColumnProcedure(tableCatalog()); + } + }; + } + + private UndeleteColumnProcedure(TableCatalog tableCatalog) { + super(tableCatalog); + } + + @Override + public ProcedureParameter[] parameters() { + return PARAMETERS; + } + + @Override + public StructType outputType() { + return OUTPUT_TYPE; + } + + @Override + public InternalRow[] call(InternalRow args) { + Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name()); + String columnName = args.getString(1); + + return modifyIcebergTable( + tableIdent, + table -> { + table.updateSchema().undeleteColumn(columnName).commit(); + + // Get the restored field info + Types.NestedField restoredField = table.schema().findField(columnName); + + return new InternalRow[] { + newInternalRow( + UTF8String.fromString(columnName), + restoredField.fieldId(), + UTF8String.fromString(restoredField.type().toString())) + }; + }); + } + + @Override + public String description() { + return "UndeleteColumnProcedure"; + } +} diff --git 
a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java index 6b42a04421dc..58c54d46a194 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/SparkProcedures.java @@ -60,6 +60,7 @@ private static Map> initProcedureBuilders() { mapBuilder.put(ComputeTableStatsProcedure.NAME, ComputeTableStatsProcedure::builder); mapBuilder.put(ComputePartitionStatsProcedure.NAME, ComputePartitionStatsProcedure::builder); mapBuilder.put(RewriteTablePathProcedure.NAME, RewriteTablePathProcedure::builder); + mapBuilder.put(UndeleteColumnProcedure.NAME, UndeleteColumnProcedure::builder); return mapBuilder.build(); } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java new file mode 100644 index 000000000000..69ef84fb1c84 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/procedures/UndeleteColumnProcedure.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.procedures;

import java.util.Iterator;
import org.apache.iceberg.spark.procedures.SparkProcedures.ProcedureBuilder;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.catalog.procedures.BoundProcedure;
import org.apache.spark.sql.connector.catalog.procedures.ProcedureParameter;
import org.apache.spark.sql.connector.read.Scan;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

/**
 * A procedure that restores a previously deleted column from the schema history.
 *
 * <p>The column is restored with its original field ID, preserving data file compatibility.
 *
 * @see org.apache.iceberg.UpdateSchema#undeleteColumn(String)
 */
class UndeleteColumnProcedure extends BaseProcedure {

  static final String NAME = "undelete_column";

  private static final ProcedureParameter TABLE_PARAM =
      requiredInParameter("table", DataTypes.StringType);
  private static final ProcedureParameter COLUMN_PARAM =
      requiredInParameter("column", DataTypes.StringType);

  private static final ProcedureParameter[] PARAMETERS =
      new ProcedureParameter[] {TABLE_PARAM, COLUMN_PARAM};

  // One output row describing the restored column: name, original field ID, and type
  private static final StructType OUTPUT_TYPE =
      new StructType(
          new StructField[] {
            new StructField("column_name", DataTypes.StringType, false, Metadata.empty()),
            new StructField("field_id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("type", DataTypes.StringType, false, Metadata.empty())
          });

  public static ProcedureBuilder builder() {
    return new BaseProcedure.Builder<UndeleteColumnProcedure>() {
      @Override
      protected UndeleteColumnProcedure doBuild() {
        return new UndeleteColumnProcedure(tableCatalog());
      }
    };
  }

  private UndeleteColumnProcedure(TableCatalog tableCatalog) {
    super(tableCatalog);
  }

  @Override
  public BoundProcedure bind(StructType inputType) {
    return this;
  }

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public Iterator<Scan> call(InternalRow args) {
    ProcedureInput input = new ProcedureInput(spark(), tableCatalog(), PARAMETERS, args);
    Identifier tableIdent = input.ident(TABLE_PARAM);
    // COLUMN_PARAM is required: use the non-defaulting accessor so a missing value fails
    // with a clear error instead of passing null into undeleteColumn
    String columnName = input.asString(COLUMN_PARAM);

    return modifyIcebergTable(
        tableIdent,
        table -> {
          // Restore the column from schema history and commit the schema change
          table.updateSchema().undeleteColumn(columnName).commit();

          // Re-resolve the field in the committed schema to report its ID and type
          Types.NestedField restoredField = table.schema().findField(columnName);
          if (restoredField == null) {
            // Should not happen after a successful commit; fail loudly rather than NPE
            throw new IllegalStateException(
                String.format("Cannot find column %s in schema after undelete", columnName));
          }

          InternalRow outputRow =
              newInternalRow(
                  UTF8String.fromString(columnName),
                  restoredField.fieldId(),
                  UTF8String.fromString(restoredField.type().toString()));
          return asScanIterator(OUTPUT_TYPE, outputRow);
        });
  }

  @Override
  public String name() {
    return NAME;
  }

  @Override
  public String description() {
    return "UndeleteColumnProcedure";
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.procedures;

import java.util.Iterator;
import org.apache.iceberg.spark.procedures.SparkProcedures.ProcedureBuilder;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.catalog.procedures.BoundProcedure;
import org.apache.spark.sql.connector.catalog.procedures.ProcedureParameter;
import org.apache.spark.sql.connector.read.Scan;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

/**
 * A procedure that restores a previously deleted column from the schema history.
 *
 * <p>The column is restored with its original field ID, preserving data file compatibility.
 *
 * @see org.apache.iceberg.UpdateSchema#undeleteColumn(String)
 */
class UndeleteColumnProcedure extends BaseProcedure {

  static final String NAME = "undelete_column";

  private static final ProcedureParameter TABLE_PARAM =
      requiredInParameter("table", DataTypes.StringType);
  private static final ProcedureParameter COLUMN_PARAM =
      requiredInParameter("column", DataTypes.StringType);

  private static final ProcedureParameter[] PARAMETERS =
      new ProcedureParameter[] {TABLE_PARAM, COLUMN_PARAM};

  // One output row describing the restored column: name, original field ID, and type
  private static final StructType OUTPUT_TYPE =
      new StructType(
          new StructField[] {
            new StructField("column_name", DataTypes.StringType, false, Metadata.empty()),
            new StructField("field_id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("type", DataTypes.StringType, false, Metadata.empty())
          });

  public static ProcedureBuilder builder() {
    return new BaseProcedure.Builder<UndeleteColumnProcedure>() {
      @Override
      protected UndeleteColumnProcedure doBuild() {
        return new UndeleteColumnProcedure(tableCatalog());
      }
    };
  }

  private UndeleteColumnProcedure(TableCatalog tableCatalog) {
    super(tableCatalog);
  }

  @Override
  public BoundProcedure bind(StructType inputType) {
    return this;
  }

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public Iterator<Scan> call(InternalRow args) {
    ProcedureInput input = new ProcedureInput(spark(), tableCatalog(), PARAMETERS, args);
    Identifier tableIdent = input.ident(TABLE_PARAM);
    // COLUMN_PARAM is required: use the non-defaulting accessor so a missing value fails
    // with a clear error instead of passing null into undeleteColumn
    String columnName = input.asString(COLUMN_PARAM);

    return modifyIcebergTable(
        tableIdent,
        table -> {
          // Restore the column from schema history and commit the schema change
          table.updateSchema().undeleteColumn(columnName).commit();

          // Re-resolve the field in the committed schema to report its ID and type
          Types.NestedField restoredField = table.schema().findField(columnName);
          if (restoredField == null) {
            // Should not happen after a successful commit; fail loudly rather than NPE
            throw new IllegalStateException(
                String.format("Cannot find column %s in schema after undelete", columnName));
          }

          InternalRow outputRow =
              newInternalRow(
                  UTF8String.fromString(columnName),
                  restoredField.fieldId(),
                  UTF8String.fromString(restoredField.type().toString()));
          return asScanIterator(OUTPUT_TYPE, outputRow);
        });
  }

  @Override
  public String name() {
    return NAME;
  }

  @Override
  public String description() {
    return "UndeleteColumnProcedure";
  }
}