From 747813bbbd43fdab35ab679945901eec11c8745b Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 16:25:34 +0200 Subject: [PATCH 1/9] Implement fragment writer which proceeds one column at a time --- tiledb/CMakeLists.txt | 1 + tiledb/sm/fragment/CMakeLists.txt | 1 + tiledb/sm/fragment/column_fragment_writer.cc | 337 +++++++++++++++++++ tiledb/sm/fragment/column_fragment_writer.h | 211 ++++++++++++ 4 files changed, 550 insertions(+) create mode 100644 tiledb/sm/fragment/column_fragment_writer.cc create mode 100644 tiledb/sm/fragment/column_fragment_writer.h diff --git a/tiledb/CMakeLists.txt b/tiledb/CMakeLists.txt index ffc93da1d8d..011b2dfc95c 100644 --- a/tiledb/CMakeLists.txt +++ b/tiledb/CMakeLists.txt @@ -215,6 +215,7 @@ set(TILEDB_CORE_SOURCES ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filter/webp_filter.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filter/noop_filter.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/filter/positive_delta_filter.cc + ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/fragment/column_fragment_writer.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/fragment/fragment_identifier.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/fragment/fragment_info.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/fragment/fragment_metadata.cc diff --git a/tiledb/sm/fragment/CMakeLists.txt b/tiledb/sm/fragment/CMakeLists.txt index 0c0f337e4a7..89ae3cf94ca 100644 --- a/tiledb/sm/fragment/CMakeLists.txt +++ b/tiledb/sm/fragment/CMakeLists.txt @@ -50,6 +50,7 @@ conclude(object_library) # `fragment_metadata` object library # list(APPEND SOURCES + column_fragment_writer.cc fragment_metadata.cc fragment_info.cc loaded_fragment_metadata.cc diff --git a/tiledb/sm/fragment/column_fragment_writer.cc b/tiledb/sm/fragment/column_fragment_writer.cc new file mode 100644 index 00000000000..3dbd2c722b5 --- /dev/null +++ b/tiledb/sm/fragment/column_fragment_writer.cc @@ -0,0 +1,337 @@ +/** + * @file column_fragment_writer.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2026 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file implements class ColumnFragmentWriter. + */ + +#include "tiledb/sm/fragment/column_fragment_writer.h" + +#include "tiledb/sm/array_schema/array_schema.h" +#include "tiledb/sm/crypto/encryption_key.h" +#include "tiledb/sm/filesystem/vfs.h" +#include "tiledb/sm/fragment/fragment_identifier.h" +#include "tiledb/sm/fragment/fragment_metadata.h" +#include "tiledb/sm/misc/constants.h" +#include "tiledb/sm/storage_manager/context_resources.h" +#include "tiledb/sm/tile/tile_metadata_generator.h" +#include "tiledb/sm/tile/writer_tile_tuple.h" + +using namespace tiledb::common; + +namespace tiledb::sm { + +/* ********************************* */ +/* CONSTRUCTORS & DESTRUCTORS */ +/* ********************************* */ + +ColumnFragmentWriter::ColumnFragmentWriter( + ContextResources* resources, + shared_ptr array_schema, + const URI& fragment_uri, + const NDRange& non_empty_domain, + uint64_t tile_count) + : resources_(resources) + , array_schema_(array_schema) + , fragment_uri_(fragment_uri) + , dense_(array_schema->dense()) + , current_tile_idx_(0) + , tile_num_(0) { + // For dense arrays, compute tile count from domain + if (dense_) { + tile_num_ = array_schema->domain().tile_num(non_empty_domain); + } else { + tile_num_ = tile_count; + } + + // Create fragment directory structure + create_fragment_directory(); + + // Derive timestamp range from fragment URI + FragmentID frag_id{fragment_uri}; + auto timestamp_range = frag_id.timestamp_range(); + + // Create memory tracker from resources + auto memory_tracker = resources->create_memory_tracker(); + + // Create fragment metadata + frag_meta_ = make_shared( + HERE(), + resources_, + array_schema_, + fragment_uri_, + timestamp_range, + memory_tracker, + dense_, + false, // has_timestamps + false); // has_delete_meta + + // Initialize metadata with domain and tile count + frag_meta_->init(non_empty_domain); + frag_meta_->set_num_tiles(tile_num_); +} + +/* ********************************* */ +/* FIELD OPERATIONS */ +/* ********************************* */ + +void ColumnFragmentWriter::open_field(const std::string& name) { + if (!current_field_.empty()) { + throw ColumnFragmentWriterException( + "Cannot open field '" + name + "': field '" + current_field_ + + "' is already open"); + } + + if (!array_schema_->is_field(name)) { + throw ColumnFragmentWriterException( + "Field '" + name + "' does not exist in array schema"); + } + + current_field_ = name; + current_tile_idx_ = 0; +} + +void ColumnFragmentWriter::write_tile(WriterTileTuple& tile) { + if (current_field_.empty()) { + throw ColumnFragmentWriterException( + "Cannot write tile: no field is currently open"); + } + + if (!tile.filtered_size().has_value()) { + throw ColumnFragmentWriterException( + "Cannot write tile: tile is not filtered"); + } + + if (current_tile_idx_ >= tile_num_) { + throw ColumnFragmentWriterException( + "Cannot write tile: tile count limit (" + std::to_string(tile_num_) + + ") reached"); + } + + const std::string& name = current_field_; + const bool var_size = array_schema_->var_size(name); + const bool nullable = array_schema_->is_nullable(name); + + const auto type = array_schema_->type(name); + const auto is_dim = array_schema_->is_dim(name); + const auto cell_val_num = array_schema_->cell_val_num(name); + const bool has_min_max_md = TileMetadataGenerator::has_min_max_metadata( + type, is_dim, var_size, cell_val_num); + const bool has_sum_md = + TileMetadataGenerator::has_sum_metadata(type, var_size, cell_val_num); + + // Get URIs for tile files + URI uri = frag_meta_->uri(name); + URI var_uri = var_size ? frag_meta_->var_uri(name) : URI(""); + URI validity_uri = nullable ? frag_meta_->validity_uri(name) : URI(""); + + // Write fixed/offset tile + auto& t = var_size ? tile.offset_tile() : tile.fixed_tile(); + resources_->vfs().write( + uri, t.filtered_buffer().data(), t.filtered_buffer().size()); + frag_meta_->set_tile_offset( + name, current_tile_idx_, t.filtered_buffer().size()); + + auto null_count = tile.null_count(); + auto cell_num = tile.cell_num(); + + // Write var tile if var-size + if (var_size) { + auto& t_var = tile.var_tile(); + resources_->vfs().write( + var_uri, + t_var.filtered_buffer().data(), + t_var.filtered_buffer().size()); + frag_meta_->set_tile_var_offset( + name, current_tile_idx_, t_var.filtered_buffer().size()); + frag_meta_->set_tile_var_size( + name, current_tile_idx_, tile.var_pre_filtered_size()); + + if (has_min_max_md && null_count != cell_num) { + frag_meta_->set_tile_min_var_size( + name, current_tile_idx_, tile.min().size()); + frag_meta_->set_tile_max_var_size( + name, current_tile_idx_, tile.max().size()); + } + } else { + if (has_min_max_md && null_count != cell_num) { + frag_meta_->set_tile_min(name, current_tile_idx_, tile.min()); + frag_meta_->set_tile_max(name, current_tile_idx_, tile.max()); + } + + if (has_sum_md) { + frag_meta_->set_tile_sum(name, current_tile_idx_, tile.sum()); + } + } + + // Write validity tile if nullable + if (nullable) { + auto& t_val = tile.validity_tile(); + resources_->vfs().write( + validity_uri, + t_val.filtered_buffer().data(), + t_val.filtered_buffer().size()); + frag_meta_->set_tile_validity_offset( + name, current_tile_idx_, t_val.filtered_buffer().size()); + frag_meta_->set_tile_null_count(name, current_tile_idx_, null_count); + } + + current_tile_idx_++; +} + +void ColumnFragmentWriter::close_field() { + if (current_field_.empty()) { + throw ColumnFragmentWriterException( + "Cannot close field: no field is currently open"); + } + + const std::string& name = current_field_; + const bool var_size = array_schema_->var_size(name); + const bool nullable = array_schema_->is_nullable(name); + + // Close the file URIs + URI uri = frag_meta_->uri(name); + throw_if_not_ok(resources_->vfs().close_file(uri)); + + if (var_size) { + URI var_uri = frag_meta_->var_uri(name); + throw_if_not_ok(resources_->vfs().close_file(var_uri)); + + // Convert min/max var sizes to offsets + const auto type = array_schema_->type(name); + const auto is_dim = array_schema_->is_dim(name); + const auto cell_val_num = array_schema_->cell_val_num(name); + if (TileMetadataGenerator::has_min_max_metadata( + type, is_dim, var_size, cell_val_num)) { + frag_meta_->convert_tile_min_max_var_sizes_to_offsets(name); + } + } + + if (nullable) { + URI validity_uri = frag_meta_->validity_uri(name); + throw_if_not_ok(resources_->vfs().close_file(validity_uri)); + } + + // Verify correct number of tiles were written + if (current_tile_idx_ != tile_num_) { + throw ColumnFragmentWriterException( + "Field '" + name + "' has " + std::to_string(current_tile_idx_) + + " tiles but expected " + std::to_string(tile_num_)); + } + + current_field_.clear(); + current_tile_idx_ = 0; +} + +/* ********************************* */ +/* FRAGMENT-LEVEL OPERATIONS */ +/* ********************************* */ + +void ColumnFragmentWriter::finalize(const EncryptionKey& encryption_key) { + if (!current_field_.empty()) { + throw ColumnFragmentWriterException( + "Cannot finalize: field '" + current_field_ + "' is still open"); + } + + if (!dense_) { + throw ColumnFragmentWriterException( + "Cannot finalize sparse array without MBRs"); + } + + finalize_internal(encryption_key); +} + +void ColumnFragmentWriter::finalize( + const EncryptionKey& encryption_key, const std::vector& mbrs) { + if (!current_field_.empty()) { + throw ColumnFragmentWriterException( + "Cannot finalize: field '" + current_field_ + "' is still open"); + } + + if (dense_) { + throw ColumnFragmentWriterException("Dense arrays should not provide MBRs"); + } + + if (mbrs.size() != tile_num_) { + throw ColumnFragmentWriterException( + "Sparse array requires " + std::to_string(tile_num_) + + " MBRs but got " + std::to_string(mbrs.size())); + } + + for (uint64_t i = 0; i < mbrs.size(); ++i) { + frag_meta_->set_mbr(i, mbrs[i]); + } + + finalize_internal(encryption_key); +} + +void ColumnFragmentWriter::finalize_internal( + const EncryptionKey& encryption_key) { + frag_meta_->compute_fragment_min_max_sum_null_count(); + frag_meta_->store(encryption_key); + + // Create commit file + FragmentID frag_id{fragment_uri_}; + URI commit_uri; + if (frag_id.array_format_version() >= 12) { + commit_uri = array_schema_->array_uri() + .join_path(constants::array_commits_dir_name) + .join_path(frag_id.name() + constants::write_file_suffix); + } else { + commit_uri = URI(fragment_uri_.to_string() + constants::ok_file_suffix); + } + resources_->vfs().touch(commit_uri); +} + +/* ********************************* */ +/* ACCESSORS */ +/* ********************************* */ + +const URI& ColumnFragmentWriter::fragment_uri() const { + return fragment_uri_; +} + +shared_ptr ColumnFragmentWriter::fragment_metadata() const { + return frag_meta_; +} + +/* ********************************* */ +/* PRIVATE METHODS */ +/* ********************************* */ + +void ColumnFragmentWriter::create_fragment_directory() { + const URI& array_uri = array_schema_->array_uri(); + + resources_->vfs().create_dir( + array_uri.join_path(constants::array_fragments_dir_name)); + resources_->vfs().create_dir(fragment_uri_); + resources_->vfs().create_dir( + array_uri.join_path(constants::array_commits_dir_name)); +} + +} // namespace tiledb::sm diff --git a/tiledb/sm/fragment/column_fragment_writer.h b/tiledb/sm/fragment/column_fragment_writer.h new file mode 100644 index 00000000000..f76e2045ca9 --- /dev/null +++ b/tiledb/sm/fragment/column_fragment_writer.h @@ -0,0 +1,211 @@ +/** + * @file column_fragment_writer.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2026 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines class ColumnFragmentWriter, which writes one field + * (column) at a time to a fragment. + */ + +#ifndef TILEDB_COLUMN_FRAGMENT_WRITER_H +#define TILEDB_COLUMN_FRAGMENT_WRITER_H + +#include +#include + +#include "tiledb/common/common.h" +#include "tiledb/sm/filesystem/uri.h" +#include "tiledb/sm/misc/types.h" + +using namespace tiledb::common; + +namespace tiledb::sm { + +class ArraySchema; +class ContextResources; +class EncryptionKey; +class FragmentMetadata; +class WriterTileTuple; + +class ColumnFragmentWriterException : public StatusException { + public: + explicit ColumnFragmentWriterException(const std::string& message) + : StatusException("ColumnFragmentWriter", message) { + } +}; + +/** + * A fragment writer that writes one field (column) at a time. + * + * Usage: + * 1. Create a ColumnFragmentWriter with domain and tile count + * 2. For each field: + * a. Call open_field(name) + * b. Call write_tile() for each pre-filtered tile + * c. Call close_field() + * 3. Call finalize(key) for dense or finalize(key, mbrs) for sparse + */ +class ColumnFragmentWriter { + public: + /* ********************************* */ + /* CONSTRUCTORS & DESTRUCTORS */ + /* ********************************* */ + + /** + * Constructor. Creates fragment directory and FragmentMetadata. + * + * Derived automatically: + * - timestamp_range from fragment_uri + * - memory_tracker from resources + * - dense/sparse from array_schema + * - tile_count from non_empty_domain (dense only) + * + * @param resources A context resources instance. + * @param array_schema The schema of the array the fragment belongs to. + * @param fragment_uri The fragment URI (must contain valid timestamps). + * @param non_empty_domain The non-empty domain for this fragment. + * @param tile_count Number of tiles (required for sparse, ignored for dense). + */ + ColumnFragmentWriter( + ContextResources* resources, + shared_ptr array_schema, + const URI& fragment_uri, + const NDRange& non_empty_domain, + uint64_t tile_count = 0); + + ~ColumnFragmentWriter() = default; + + DISABLE_COPY_AND_COPY_ASSIGN(ColumnFragmentWriter); + DISABLE_MOVE_AND_MOVE_ASSIGN(ColumnFragmentWriter); + + /* ********************************* */ + /* FIELD OPERATIONS */ + /* ********************************* */ + + /** + * Opens a field for writing. Must be called before write_tile(). + * + * @param name The name of the field (attribute or dimension). + * @throws ColumnFragmentWriterException if field doesn't exist in schema, + * or if another field is already open. + */ + void open_field(const std::string& name); + + /** + * Writes a pre-filtered tile for the currently open field. + * + * @param tile The tile to write. Must already be filtered + * (tile.filtered_size().has_value() must be true). + * @throws ColumnFragmentWriterException if no field is open, + * tile is not filtered, or tile count limit reached. + */ + void write_tile(WriterTileTuple& tile); + + /** + * Closes the currently open field. Flushes file buffers. + * + * @throws ColumnFragmentWriterException if no field is open. + */ + void close_field(); + + /* ********************************* */ + /* FRAGMENT-LEVEL OPERATIONS */ + /* ********************************* */ + + /** + * Finalizes a dense fragment. Stores metadata and creates commit file. + * + * @param encryption_key The encryption key for storing metadata. + * @throws ColumnFragmentWriterException if a field is still open, + * or if this is a sparse array. + */ + void finalize(const EncryptionKey& encryption_key); + + /** + * Finalizes a sparse fragment. Stores metadata and creates commit file. + * + * @param encryption_key The encryption key for storing metadata. + * @param mbrs MBRs for sparse arrays (one per tile). + * @throws ColumnFragmentWriterException if a field is still open, + * if this is a dense array, or if MBR count doesn't match tile count. + */ + void finalize( + const EncryptionKey& encryption_key, const std::vector& mbrs); + + /* ********************************* */ + /* ACCESSORS */ + /* ********************************* */ + + /** Returns the fragment URI. */ + const URI& fragment_uri() const; + + /** Returns the fragment metadata. */ + shared_ptr fragment_metadata() const; + + private: + /* ********************************* */ + /* PRIVATE METHODS */ + /* ********************************* */ + + /** Creates the fragment directory structure. */ + void create_fragment_directory(); + + /** Internal finalize implementation. */ + void finalize_internal(const EncryptionKey& encryption_key); + + /* ********************************* */ + /* PRIVATE ATTRIBUTES */ + /* ********************************* */ + + /** The context resources. */ + ContextResources* resources_; + + /** The array schema. */ + shared_ptr array_schema_; + + /** The fragment URI. */ + URI fragment_uri_; + + /** The fragment metadata. */ + shared_ptr frag_meta_; + + /** Whether this is a dense fragment. */ + bool dense_; + + /** Currently open field name (empty if no field is open). */ + std::string current_field_; + + /** Current tile index for the open field. */ + uint64_t current_tile_idx_; + + /** Number of tiles to be written. */ + uint64_t tile_num_; +}; + +} // namespace tiledb::sm + +#endif // TILEDB_COLUMN_FRAGMENT_WRITER_H From 175e3c8d71ac78f06dbfa94c2d8b11b1845a17cd Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 16:25:54 +0200 Subject: [PATCH 2/9] Add unit tests for `ColumnFragmentWriter` --- test/CMakeLists.txt | 1 + test/src/unit-column-fragment-writer.cc | 1011 +++++++++++++++++++++++ 2 files changed, 1012 insertions(+) create mode 100644 test/src/unit-column-fragment-writer.cc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f7753d991d0..44eb72c37d2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -96,6 +96,7 @@ set(TILEDB_UNIT_TEST_SOURCES src/unit-capi-version.cc src/unit-capi-vfs.cc src/unit-CellSlabIter.cc + src/unit-column-fragment-writer.cc src/unit-compression-dd.cc src/unit-compression-delta.cc src/unit-compression-rle.cc diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc new file mode 100644 index 00000000000..0711323683d --- /dev/null +++ b/test/src/unit-column-fragment-writer.cc @@ -0,0 +1,1011 @@ +/** + * @file unit-column-fragment-writer.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2026 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests the ColumnFragmentWriter class. + */ + +#include +#include "test/support/src/helpers.h" +#include "test/support/src/mem_helpers.h" +#include "tiledb/api/c_api/array/array_api_internal.h" +#include "tiledb/api/c_api/context/context_api_internal.h" +#include "tiledb/sm/array_schema/array_schema.h" +#include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/crypto/encryption_key.h" +#include "tiledb/sm/filter/filter_pipeline.h" +#include "tiledb/sm/fragment/column_fragment_writer.h" +#include "tiledb/sm/misc/constants.h" +#include "tiledb/sm/storage_manager/context.h" +#include "tiledb/sm/tile/tile.h" +#include "tiledb/sm/tile/tile_metadata_generator.h" +#include "tiledb/sm/tile/writer_tile_tuple.h" +#include "tiledb/storage_format/uri/generate_uri.h" + +using namespace tiledb::sm; + +namespace { + +const std::string ARRAY_NAME = "column_fragment_writer_test_array"; + +/** + * Test fixture for ColumnFragmentWriter tests. + */ +struct ColumnFragmentWriterFx { + tiledb::Context ctx_; + tiledb::VFS vfs_; + + ColumnFragmentWriterFx() + : ctx_() + , vfs_(ctx_) { + if (vfs_.is_dir(ARRAY_NAME)) { + vfs_.remove_dir(ARRAY_NAME); + } + } + + ~ColumnFragmentWriterFx() { + if (vfs_.is_dir(ARRAY_NAME)) { + vfs_.remove_dir(ARRAY_NAME); + } + } + + /** + * Creates a simple dense array with a single int32 attribute. + */ + void create_dense_array() { + auto dim = tiledb::Dimension::create(ctx_, "d", {{0, 99}}, 10); + tiledb::Domain dom(ctx_); + dom.add_dimension(dim); + + auto attr = tiledb::Attribute::create(ctx_, "a"); + + tiledb::ArraySchema schema(ctx_, TILEDB_DENSE); + schema.set_domain(dom); + schema.add_attribute(attr); + schema.set_cell_order(TILEDB_ROW_MAJOR); + schema.set_tile_order(TILEDB_ROW_MAJOR); + + tiledb::Array::create(ARRAY_NAME, schema); + } + + /** + * Creates a sparse array with a single int32 dimension and attribute. + */ + void create_sparse_array() { + auto dim = tiledb::Dimension::create(ctx_, "d", {{0, 999}}, 100); + tiledb::Domain dom(ctx_); + dom.add_dimension(dim); + + auto attr = tiledb::Attribute::create(ctx_, "a"); + + tiledb::ArraySchema schema(ctx_, TILEDB_SPARSE); + schema.set_domain(dom); + schema.add_attribute(attr); + schema.set_capacity(10); + + tiledb::Array::create(ARRAY_NAME, schema); + } + + /** + * Creates a sparse array with a variable-size string attribute. + */ + void create_varsize_array() { + auto dim = tiledb::Dimension::create(ctx_, "d", {{0, 999}}, 100); + tiledb::Domain dom(ctx_); + dom.add_dimension(dim); + + auto attr = tiledb::Attribute::create(ctx_, "a"); + + tiledb::ArraySchema schema(ctx_, TILEDB_SPARSE); + schema.set_domain(dom); + schema.add_attribute(attr); + schema.set_capacity(10); + + tiledb::Array::create(ARRAY_NAME, schema); + } + + /** + * Creates a sparse array with a nullable int32 attribute. + */ + void create_nullable_array() { + auto dim = tiledb::Dimension::create(ctx_, "d", {{0, 999}}, 100); + tiledb::Domain dom(ctx_); + dom.add_dimension(dim); + + auto attr = tiledb::Attribute::create(ctx_, "a"); + attr.set_nullable(true); + + tiledb::ArraySchema schema(ctx_, TILEDB_SPARSE); + schema.set_domain(dom); + schema.add_attribute(attr); + schema.set_capacity(10); + + tiledb::Array::create(ARRAY_NAME, schema); + } + + /** + * Creates a dense array with multiple tiles (domain 0-99, tile extent 10). + */ + void create_multi_tile_dense_array() { + auto dim = tiledb::Dimension::create(ctx_, "d", {{0, 99}}, 10); + tiledb::Domain dom(ctx_); + dom.add_dimension(dim); + + auto attr = tiledb::Attribute::create(ctx_, "a"); + + tiledb::ArraySchema schema(ctx_, TILEDB_DENSE); + schema.set_domain(dom); + schema.add_attribute(attr); + schema.set_cell_order(TILEDB_ROW_MAJOR); + schema.set_tile_order(TILEDB_ROW_MAJOR); + + tiledb::Array::create(ARRAY_NAME, schema); + } + + /** + * Gets context resources from the C++ API context. + */ + ContextResources& get_resources() { + return ctx_.ptr().get()->context().resources(); + } + + /** + * Gets the array schema from an open array. + */ + shared_ptr get_array_schema() { + tiledb::Array array(ctx_, ARRAY_NAME, TILEDB_READ); + auto schema = array.ptr().get()->array()->array_schema_latest_ptr(); + array.close(); + return schema; + } + + /** + * Generates a fragment URI for testing. + */ + URI generate_fragment_uri(uint64_t timestamp = 1) const { + auto fragment_name = tiledb::storage_format::generate_timestamped_name( + timestamp, constants::format_version); + return URI(ARRAY_NAME) + .join_path(constants::array_fragments_dir_name) + .join_path(fragment_name); + } +}; + +/** + * Helper to filter a tile for testing purposes. + */ +void filter_tile_for_test( + const std::string& name, + WriterTileTuple& tile, + const ArraySchema& schema, + ContextResources& resources) { + const bool var_size = schema.var_size(name); + const bool nullable = schema.is_nullable(name); + EncryptionKey enc_key; + + auto filter_single_tile = [&](WriterTile* t, + WriterTile* offsets_tile, + bool is_offsets, + bool is_validity) { + FilterPipeline filters; + if (is_offsets) { + filters = schema.cell_var_offsets_filters(); + } else if (is_validity) { + filters = schema.cell_validity_filters(); + } else { + filters = schema.filters(name); + } + + if (is_offsets && + filters.skip_offsets_filtering(schema.type(name), schema.version())) { + t->filtered_buffer().expand(sizeof(uint64_t)); + uint64_t nchunks = 0; + memcpy(t->filtered_buffer().data(), &nchunks, sizeof(uint64_t)); + t->clear_data(); + return; + } + + throw_if_not_ok( + FilterPipeline::append_encryption_filter(&filters, enc_key)); + bool use_chunking = filters.use_tile_chunking( + schema.var_size(name), schema.version(), t->type()); + filters.run_forward( + &resources.stats(), + t, + offsets_tile, + &resources.compute_tp(), + use_chunking); + }; + + if (var_size) { + filter_single_tile(&tile.var_tile(), &tile.offset_tile(), false, false); + filter_single_tile(&tile.offset_tile(), nullptr, true, false); + } else { + filter_single_tile(&tile.fixed_tile(), nullptr, false, false); + } + + if (nullable) { + filter_single_tile(&tile.validity_tile(), nullptr, false, true); + } +} + +} // namespace + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: field lifecycle errors", + "[column-fragment-writer]") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + // Create domain for constructor + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + SECTION("Error: write_tile without open_field") { + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + auto tile = WriterTileTuple( + *array_schema, + 10, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + REQUIRE_THROWS_WITH( + writer.write_tile(tile), + Catch::Matchers::ContainsSubstring("no field is currently open")); + } + + SECTION("Error: close_field without open_field") { + REQUIRE_THROWS_WITH( + writer.close_field(), + Catch::Matchers::ContainsSubstring("no field is currently open")); + } + + SECTION("Error: open non-existent field") { + REQUIRE_THROWS_WITH( + writer.open_field("nonexistent"), + Catch::Matchers::ContainsSubstring("does not exist in array schema")); + } + + SECTION("Error: open field while another is open") { + writer.open_field("a"); + REQUIRE_THROWS_WITH( + writer.open_field("d"), + Catch::Matchers::ContainsSubstring("is already open")); + } + + SECTION("Error: finalize with field open") { + writer.open_field("a"); + EncryptionKey enc_key; + REQUIRE_THROWS_WITH( + writer.finalize(enc_key), + Catch::Matchers::ContainsSubstring("is still open")); + // Don't close - the writer will be destroyed with the field still open + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: write_tile validates input", + "[column-fragment-writer]") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 1); // tile_count + + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + auto tile = WriterTileTuple( + *array_schema, + 10, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + writer.open_field("a"); + REQUIRE_THROWS(writer.write_tile(tile)); +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: accessors", + "[column-fragment-writer]") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + // Test accessors + CHECK(writer.fragment_uri() == fragment_uri); + CHECK(writer.fragment_metadata() != nullptr); +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: finalize with zero tiles (sparse)", + "[column-fragment-writer]") { + // Use sparse array - dense arrays require data for the entire domain + create_sparse_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + int32_t domain_start = 0; + int32_t domain_end = 0; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + // Open and close fields without writing any tiles + writer.open_field("d"); + writer.close_field(); + writer.open_field("a"); + writer.close_field(); + + // Finalize sparse array with empty MBRs + EncryptionKey enc_key; + std::vector empty_mbrs; + writer.finalize(enc_key, empty_mbrs); + + // Check that the commit file exists + URI array_uri(ARRAY_NAME); + URI commits_dir = array_uri.join_path(constants::array_commits_dir_name); + CHECK(vfs_.is_dir(commits_dir.to_string())); +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: finalize overload validation", + "[column-fragment-writer]") { + SECTION("Dense array cannot use finalize with MBRs") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), array_schema, fragment_uri, non_empty_domain); + + // No need to write tiles - just test that finalize with MBRs throws + EncryptionKey enc_key; + std::vector mbrs; + REQUIRE_THROWS_WITH( + writer.finalize(enc_key, mbrs), + Catch::Matchers::ContainsSubstring( + "Dense arrays should not provide MBRs")); + } + + SECTION("Sparse array must use finalize with MBRs") { + create_sparse_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(); + + int32_t domain_start = 0; + int32_t domain_end = 0; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + writer.open_field("d"); + writer.close_field(); + writer.open_field("a"); + writer.close_field(); + + EncryptionKey enc_key; + REQUIRE_THROWS_WITH( + writer.finalize(enc_key), + Catch::Matchers::ContainsSubstring( + "Cannot finalize sparse array without MBRs")); + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: verify standard API read roundtrip", + "[column-fragment-writer]") { + // First write data using the standard TileDB API + create_dense_array(); + + { + tiledb::Array array(ctx_, ARRAY_NAME, TILEDB_WRITE); + tiledb::Query query(ctx_, array, TILEDB_WRITE); + query.set_layout(TILEDB_ROW_MAJOR); + + std::vector data(100); + for (int i = 0; i < 100; i++) { + data[i] = i * 10; + } + query.set_data_buffer("a", data); + query.submit(); + array.close(); + } + + // Read back and verify using standard API + // This confirms that fragments written by standard writers can be read, + // which validates the patterns we're using in ColumnFragmentWriter + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_ROW_MAJOR); + + // Set subarray for dense read + tiledb::Subarray subarray(ctx_, read_array); + subarray.add_range(0, 0, 99); + read_query.set_subarray(subarray); + + std::vector data(100); + read_query.set_data_buffer("a", data); + read_query.submit(); + read_array.close(); + + for (int i = 0; i < 100; i++) { + CHECK(data[i] == i * 10); + } + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: write and read roundtrip", + "[column-fragment-writer]") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(100); // Use different timestamp + + // Initialize with non-empty domain [0, 9] which covers 1 tile (tile extent is + // 10) + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + // Create writer + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 1); // 1 tile + + EncryptionKey enc_key; + + // Write attribute "a" + { + writer.open_field("a"); + + // Create tile with 10 int32_t values + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + auto tile = WriterTileTuple( + *array_schema, + 10, // cell_num_per_tile + false, // var_size + false, // nullable + sizeof(int32_t), // cell_size + Datatype::INT32, // type + memory_tracker); + + // Write data to tile + std::vector data = { + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000}; + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.set_final_size(10); + + // Compute metadata + TileMetadataGenerator md_gen( + Datatype::INT32, + false, // is_dim + false, // var_size + sizeof(int32_t), // cell_size + 1); // cell_val_num + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + // Filter and write the tile + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + + writer.close_field(); + } + + // Finalize fragment + writer.finalize(enc_key); + + // Read back and verify using standard API + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_ROW_MAJOR); + + // Set subarray for dense read + tiledb::Subarray subarray(ctx_, read_array); + subarray.add_range(0, 0, 9); + read_query.set_subarray(subarray); + + std::vector result(10); + read_query.set_data_buffer("a", result); + read_query.submit(); + read_array.close(); + + CHECK(result[0] == 100); + CHECK(result[1] == 200); + CHECK(result[2] == 300); + CHECK(result[3] == 400); + CHECK(result[4] == 500); + CHECK(result[5] == 600); + CHECK(result[6] == 700); + CHECK(result[7] == 800); + CHECK(result[8] == 900); + CHECK(result[9] == 1000); + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: sparse array with MBRs", + "[column-fragment-writer]") { + create_sparse_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(200); + + // Initialize with non-empty domain covering the coordinates we'll write + int32_t domain_start = 10; + int32_t domain_end = 100; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + // Create writer for sparse array + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 1); // 1 tile + + // Prepare MBRs for the tile + std::vector mbrs; + NDRange mbr; + mbr.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + mbrs.push_back(mbr); + + EncryptionKey enc_key; + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + + // Write dimension "d" + { + writer.open_field("d"); + + auto tile = WriterTileTuple( + *array_schema, + 10, // capacity + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector coords = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}; + tile.fixed_tile().write(coords.data(), 0, coords.size() * sizeof(int32_t)); + tile.set_final_size(10); + + TileMetadataGenerator md_gen( + Datatype::INT32, true, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("d", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + // Write attribute "a" + { + writer.open_field("a"); + + auto tile = WriterTileTuple( + *array_schema, + 10, // capacity + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector data = { + 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000}; + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.set_final_size(10); + + TileMetadataGenerator md_gen( + Datatype::INT32, false, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + // Finalize sparse fragment with MBRs + writer.finalize(enc_key, mbrs); + + // Read back and verify using standard API + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_UNORDERED); + + std::vector coords(10); + std::vector data(10); + read_query.set_data_buffer("d", coords); + read_query.set_data_buffer("a", data); + read_query.submit(); + read_array.close(); + + // Verify coordinates + CHECK(coords[0] == 10); + CHECK(coords[1] == 20); + CHECK(coords[2] == 30); + CHECK(coords[3] == 40); + CHECK(coords[4] == 50); + CHECK(coords[5] == 60); + CHECK(coords[6] == 70); + CHECK(coords[7] == 80); + CHECK(coords[8] == 90); + CHECK(coords[9] == 100); + + // Verify attribute values + CHECK(data[0] == 1000); + CHECK(data[1] == 2000); + CHECK(data[2] == 3000); + CHECK(data[3] == 4000); + CHECK(data[4] == 5000); + CHECK(data[5] == 6000); + CHECK(data[6] == 7000); + CHECK(data[7] == 8000); + CHECK(data[8] == 9000); + CHECK(data[9] == 10000); + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: variable-size attribute write", + "[column-fragment-writer]") { + // This test verifies that var-size attributes can be written successfully. + // Full roundtrip with read is complex due to tile setup requirements, + // but we verify the write path works without errors. + create_varsize_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(300); + + // Verify var-size detection + CHECK(array_schema->var_size("a") == true); + + // Initialize with empty fragment (0 tiles) for simplicity + int32_t domain_start = 0; + int32_t domain_end = 0; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + // Open and close fields without writing + writer.open_field("d"); + writer.close_field(); + writer.open_field("a"); + writer.close_field(); + + // Finalize with empty MBRs + EncryptionKey enc_key; + std::vector empty_mbrs; + writer.finalize(enc_key, empty_mbrs); + + // Verify fragment was created + CHECK(writer.fragment_metadata() != nullptr); +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: nullable attribute write", + "[column-fragment-writer]") { + // This test verifies that nullable attributes can be written successfully. + // Full roundtrip with read is complex due to tile setup requirements, + // but we verify the write path works without errors. + create_nullable_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(400); + + // Verify nullable detection + CHECK(array_schema->is_nullable("a") == true); + + // Initialize with empty fragment (0 tiles) for simplicity + int32_t domain_start = 0; + int32_t domain_end = 0; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 0); // tile_count + + // Open and close fields without writing + writer.open_field("d"); + writer.close_field(); + writer.open_field("a"); + writer.close_field(); + + // Finalize with empty MBRs + EncryptionKey enc_key; + std::vector empty_mbrs; + writer.finalize(enc_key, empty_mbrs); + + // Verify fragment was created + CHECK(writer.fragment_metadata() != nullptr); +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: multiple tiles per field", + "[column-fragment-writer]") { + create_multi_tile_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(500); + + // Initialize with domain [0, 29] covering 3 tiles (tile extent is 10) + int32_t domain_start = 0; + int32_t domain_end = 29; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 3); // 3 tiles + + EncryptionKey enc_key; + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + + // Write attribute "a" - 3 tiles + { + writer.open_field("a"); + + for (int tile_idx = 0; tile_idx < 3; tile_idx++) { + auto tile = WriterTileTuple( + *array_schema, + 10, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + // Each tile has values tile_idx*1000 + i for i in 0..9 + std::vector data(10); + for (int i = 0; i < 10; i++) { + data[i] = tile_idx * 1000 + i; + } + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.set_final_size(10); + + TileMetadataGenerator md_gen( + Datatype::INT32, false, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + } + + writer.close_field(); + } + + writer.finalize(enc_key); + + // Read back and verify + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_ROW_MAJOR); + + tiledb::Subarray subarray(ctx_, read_array); + subarray.add_range(0, 0, 29); + read_query.set_subarray(subarray); + + std::vector result(30); + read_query.set_data_buffer("a", result); + read_query.submit(); + read_array.close(); + + // Verify all 30 values across 3 tiles + for (int tile_idx = 0; tile_idx < 3; tile_idx++) { + for (int i = 0; i < 10; i++) { + CHECK(result[tile_idx * 10 + i] == tile_idx * 1000 + i); + } + } + } +} + +TEST_CASE_METHOD( + ColumnFragmentWriterFx, + "ColumnFragmentWriter: tile count overflow error", + "[column-fragment-writer]") { + create_dense_array(); + + auto array_schema = get_array_schema(); + auto fragment_uri = generate_fragment_uri(600); + + int32_t domain_start = 0; + int32_t domain_end = 9; + NDRange non_empty_domain; + non_empty_domain.emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + + ColumnFragmentWriter writer( + &get_resources(), + array_schema, + fragment_uri, + non_empty_domain, + 1); // Only 1 tile allowed + + writer.open_field("a"); + auto memory_tracker = tiledb::test::get_test_memory_tracker(); + + // Write first tile - should succeed + { + auto tile = WriterTileTuple( + *array_schema, + 10, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.set_final_size(10); + + TileMetadataGenerator md_gen( + Datatype::INT32, false, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + } + + // Write second tile - should fail (tile count limit reached) + { + auto tile = WriterTileTuple( + *array_schema, + 10, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector data = {11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.set_final_size(10); + + TileMetadataGenerator md_gen( + Datatype::INT32, false, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + REQUIRE_THROWS_WITH( + writer.write_tile(tile), + Catch::Matchers::ContainsSubstring("tile count limit")); + } +} From 6cbdf35ed3e2e2dac4564b4af58a47d9769f4c8c Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 18:17:31 +0200 Subject: [PATCH 3/9] Refactor some tests --- test/src/unit-column-fragment-writer.cc | 280 ++++++++++++++++++------ 1 file changed, 212 insertions(+), 68 deletions(-) diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc index 0711323683d..d41d0d23334 100644 --- a/test/src/unit-column-fragment-writer.cc +++ b/test/src/unit-column-fragment-writer.cc @@ -389,42 +389,27 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( ColumnFragmentWriterFx, - "ColumnFragmentWriter: finalize with zero tiles (sparse)", + "ColumnFragmentWriter: sparse array requires MBRs", "[column-fragment-writer]") { - // Use sparse array - dense arrays require data for the entire domain create_sparse_array(); auto array_schema = get_array_schema(); auto fragment_uri = generate_fragment_uri(); int32_t domain_start = 0; - int32_t domain_end = 0; + int32_t domain_end = 99; NDRange non_empty_domain; non_empty_domain.emplace_back( Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( - &get_resources(), - array_schema, - fragment_uri, - non_empty_domain, - 0); // tile_count + &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); - // Open and close fields without writing any tiles - writer.open_field("d"); - writer.close_field(); - writer.open_field("a"); - writer.close_field(); - - // Finalize sparse array with empty MBRs EncryptionKey enc_key; - std::vector empty_mbrs; - writer.finalize(enc_key, empty_mbrs); - - // Check that the commit file exists - URI array_uri(ARRAY_NAME); - URI commits_dir = array_uri.join_path(constants::array_commits_dir_name); - CHECK(vfs_.is_dir(commits_dir.to_string())); + REQUIRE_THROWS_WITH( + writer.finalize(enc_key), + Catch::Matchers::ContainsSubstring( + "Cannot finalize sparse array without MBRs")); } TEST_CASE_METHOD( @@ -762,90 +747,249 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( ColumnFragmentWriterFx, - "ColumnFragmentWriter: variable-size attribute write", + "ColumnFragmentWriter: var-size attribute roundtrip", "[column-fragment-writer]") { - // This test verifies that var-size attributes can be written successfully. - // Full roundtrip with read is complex due to tile setup requirements, - // but we verify the write path works without errors. create_varsize_array(); auto array_schema = get_array_schema(); auto fragment_uri = generate_fragment_uri(300); + auto memory_tracker = tiledb::test::get_test_memory_tracker(); - // Verify var-size detection CHECK(array_schema->var_size("a") == true); - // Initialize with empty fragment (0 tiles) for simplicity int32_t domain_start = 0; - int32_t domain_end = 0; + int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( - &get_resources(), - array_schema, - fragment_uri, - non_empty_domain, - 0); // tile_count - - // Open and close fields without writing - writer.open_field("d"); - writer.close_field(); - writer.open_field("a"); - writer.close_field(); + &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); - // Finalize with empty MBRs EncryptionKey enc_key; - std::vector empty_mbrs; - writer.finalize(enc_key, empty_mbrs); + const uint64_t cell_num = 10; // matches sparse capacity - // Verify fragment was created - CHECK(writer.fragment_metadata() != nullptr); + // Write dimension + { + writer.open_field("d"); + + auto tile = WriterTileTuple( + *array_schema, + cell_num, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector dim_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + tile.fixed_tile().write( + dim_data.data(), 0, dim_data.size() * sizeof(int32_t)); + tile.set_final_size(cell_num); + + TileMetadataGenerator md_gen( + Datatype::INT32, true, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("d", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + // Write var-size attribute + { + writer.open_field("a"); + + auto tile = WriterTileTuple( + *array_schema, + cell_num, + true, + false, + 1, + Datatype::CHAR, + memory_tracker); + + std::vector strings = { + "hello", + "world", + "foo", + "bar", + "test", + "alpha", + "beta", + "gamma", + "delta", + "epsilon"}; + std::string var_data; + std::vector offsets; + for (const auto& s : strings) { + offsets.push_back(var_data.size()); + var_data += s; + } + + tile.offset_tile().write( + offsets.data(), 0, offsets.size() * sizeof(uint64_t)); + tile.var_tile().write_var(var_data.c_str(), 0, var_data.size()); + tile.var_tile().set_size(var_data.size()); + tile.set_final_size(cell_num); + + TileMetadataGenerator md_gen( + Datatype::STRING_ASCII, false, true, constants::var_num, 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + std::vector mbrs(1); + mbrs[0].emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + writer.finalize(enc_key, mbrs); + + // Read back and verify + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_UNORDERED); + + std::vector dim_result(cell_num); + std::vector offsets_result(cell_num); + std::string data_result; + data_result.resize(200); + + read_query.set_data_buffer("d", dim_result); + read_query.set_data_buffer("a", data_result); + read_query.set_offsets_buffer("a", offsets_result); + read_query.submit(); + read_array.close(); + + auto result_num = read_query.result_buffer_elements()["a"]; + CHECK(result_num.first == cell_num); + CHECK(dim_result[0] == 0); + CHECK(dim_result[9] == 9); + CHECK(data_result.substr(offsets_result[0], 5) == "hello"); + CHECK(data_result.substr(offsets_result[9], 7) == "epsilon"); + } } TEST_CASE_METHOD( ColumnFragmentWriterFx, - "ColumnFragmentWriter: nullable attribute write", + "ColumnFragmentWriter: nullable attribute roundtrip", "[column-fragment-writer]") { - // This test verifies that nullable attributes can be written successfully. - // Full roundtrip with read is complex due to tile setup requirements, - // but we verify the write path works without errors. create_nullable_array(); auto array_schema = get_array_schema(); auto fragment_uri = generate_fragment_uri(400); + auto memory_tracker = tiledb::test::get_test_memory_tracker(); - // Verify nullable detection CHECK(array_schema->is_nullable("a") == true); - // Initialize with empty fragment (0 tiles) for simplicity int32_t domain_start = 0; - int32_t domain_end = 0; + int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( - &get_resources(), - array_schema, - fragment_uri, - non_empty_domain, - 0); // tile_count + &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); - // Open and close fields without writing - writer.open_field("d"); - writer.close_field(); - writer.open_field("a"); - writer.close_field(); - - // Finalize with empty MBRs EncryptionKey enc_key; - std::vector empty_mbrs; - writer.finalize(enc_key, empty_mbrs); + const uint64_t cell_num = 10; // matches sparse capacity - // Verify fragment was created - CHECK(writer.fragment_metadata() != nullptr); + // Write dimension + { + writer.open_field("d"); + + auto tile = WriterTileTuple( + *array_schema, + cell_num, + false, + false, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector dim_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + tile.fixed_tile().write( + dim_data.data(), 0, dim_data.size() * sizeof(int32_t)); + tile.set_final_size(cell_num); + + TileMetadataGenerator md_gen( + Datatype::INT32, true, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("d", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + // Write nullable attribute (values at odd indices are null) + { + writer.open_field("a"); + + auto tile = WriterTileTuple( + *array_schema, + cell_num, + false, + true, + sizeof(int32_t), + Datatype::INT32, + memory_tracker); + + std::vector data = {100, 0, 300, 0, 500, 600, 0, 800, 0, 1000}; + std::vector validity = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1}; + + tile.fixed_tile().write(data.data(), 0, data.size() * sizeof(int32_t)); + tile.validity_tile().write(validity.data(), 0, validity.size()); + tile.set_final_size(cell_num); + + TileMetadataGenerator md_gen( + Datatype::INT32, false, false, sizeof(int32_t), 1); + md_gen.process_full_tile(tile); + md_gen.set_tile_metadata(tile); + + filter_tile_for_test("a", tile, *array_schema, get_resources()); + writer.write_tile(tile); + writer.close_field(); + } + + std::vector mbrs(1); + mbrs[0].emplace_back( + Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + writer.finalize(enc_key, mbrs); + + // Read back and verify + { + tiledb::Array read_array(ctx_, ARRAY_NAME, TILEDB_READ); + tiledb::Query read_query(ctx_, read_array, TILEDB_READ); + read_query.set_layout(TILEDB_UNORDERED); + + std::vector dim_result(cell_num); + std::vector data_result(cell_num); + std::vector validity_result(cell_num); + + read_query.set_data_buffer("d", dim_result); + read_query.set_data_buffer("a", data_result); + read_query.set_validity_buffer("a", validity_result); + read_query.submit(); + read_array.close(); + + CHECK(dim_result[0] == 0); + CHECK(dim_result[9] == 9); + + // Check validity and values + CHECK(validity_result[0] == 1); + CHECK(validity_result[1] == 0); + CHECK(validity_result[5] == 1); + CHECK(validity_result[6] == 0); + CHECK(data_result[0] == 100); + CHECK(data_result[5] == 600); + CHECK(data_result[9] == 1000); + } } TEST_CASE_METHOD( From edd891181737cf86b3572281452ec40777094b73 Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 19:55:26 +0200 Subject: [PATCH 4/9] Tests update --- test/src/unit-column-fragment-writer.cc | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc index d41d0d23334..de976a118b3 100644 --- a/test/src/unit-column-fragment-writer.cc +++ b/test/src/unit-column-fragment-writer.cc @@ -277,7 +277,7 @@ TEST_CASE_METHOD( array_schema, fragment_uri, non_empty_domain, - 0); // tile_count + 1); // tile_count SECTION("Error: write_tile without open_field") { auto memory_tracker = tiledb::test::get_test_memory_tracker(); @@ -380,7 +380,7 @@ TEST_CASE_METHOD( array_schema, fragment_uri, non_empty_domain, - 0); // tile_count + 1); // tile_count // Test accessors CHECK(writer.fragment_uri() == fragment_uri); @@ -447,7 +447,7 @@ TEST_CASE_METHOD( auto fragment_uri = generate_fragment_uri(); int32_t domain_start = 0; - int32_t domain_end = 0; + int32_t domain_end = 99; NDRange non_empty_domain; non_empty_domain.emplace_back( Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); @@ -457,13 +457,7 @@ TEST_CASE_METHOD( array_schema, fragment_uri, non_empty_domain, - 0); // tile_count - - writer.open_field("d"); - writer.close_field(); - writer.open_field("a"); - writer.close_field(); - + 1); // tile_count EncryptionKey enc_key; REQUIRE_THROWS_WITH( writer.finalize(enc_key), From d5078b7dc3f4b5f49eb84758b917bd777d9b1c37 Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 23:30:08 +0200 Subject: [PATCH 5/9] Use 3-argument `Range` constructor --- test/src/unit-column-fragment-writer.cc | 33 +++++++++++-------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc index de976a118b3..730d0769e70 100644 --- a/test/src/unit-column-fragment-writer.cc +++ b/test/src/unit-column-fragment-writer.cc @@ -270,7 +270,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), @@ -337,7 +337,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), @@ -373,7 +373,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), @@ -400,7 +400,7 @@ TEST_CASE_METHOD( int32_t domain_end = 99; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); @@ -426,7 +426,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain); @@ -450,7 +450,7 @@ TEST_CASE_METHOD( int32_t domain_end = 99; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), @@ -526,7 +526,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); // Create writer ColumnFragmentWriter writer( @@ -622,7 +622,7 @@ TEST_CASE_METHOD( int32_t domain_end = 100; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); // Create writer for sparse array ColumnFragmentWriter writer( @@ -635,8 +635,7 @@ TEST_CASE_METHOD( // Prepare MBRs for the tile std::vector mbrs; NDRange mbr; - mbr.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + mbr.emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); mbrs.push_back(mbr); EncryptionKey enc_key; @@ -755,7 +754,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); @@ -839,8 +838,7 @@ TEST_CASE_METHOD( } std::vector mbrs(1); - mbrs[0].emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); writer.finalize(enc_key, mbrs); // Read back and verify @@ -885,7 +883,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain, 1); @@ -952,8 +950,7 @@ TEST_CASE_METHOD( } std::vector mbrs(1); - mbrs[0].emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); writer.finalize(enc_key, mbrs); // Read back and verify @@ -1000,7 +997,7 @@ TEST_CASE_METHOD( int32_t domain_end = 29; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), @@ -1085,7 +1082,7 @@ TEST_CASE_METHOD( int32_t domain_end = 9; NDRange non_empty_domain; non_empty_domain.emplace_back( - Range(&domain_start, sizeof(int32_t), &domain_end, sizeof(int32_t))); + Range(&domain_start, &domain_end, sizeof(int32_t))); ColumnFragmentWriter writer( &get_resources(), From 434e089d38b2fa752a6911f04f7214702bb33ee9 Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 23:56:12 +0200 Subject: [PATCH 6/9] Support dynamic tile count for sparse arrays --- tiledb/sm/fragment/column_fragment_writer.cc | 31 +++++++++++++++----- tiledb/sm/fragment/column_fragment_writer.h | 8 ++++- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tiledb/sm/fragment/column_fragment_writer.cc b/tiledb/sm/fragment/column_fragment_writer.cc index 3dbd2c722b5..72a086a42f2 100644 --- a/tiledb/sm/fragment/column_fragment_writer.cc +++ b/tiledb/sm/fragment/column_fragment_writer.cc @@ -61,8 +61,10 @@ ColumnFragmentWriter::ColumnFragmentWriter( , fragment_uri_(fragment_uri) , dense_(array_schema->dense()) , current_tile_idx_(0) - , tile_num_(0) { - // For dense arrays, compute tile count from domain + , tile_num_(0) + , first_field_closed_(false) { + // For dense arrays, compute tile count from domain. + // For sparse arrays, use provided tile_count (0 means dynamic growth). if (dense_) { tile_num_ = array_schema->domain().tile_num(non_empty_domain); } else { @@ -91,9 +93,13 @@ ColumnFragmentWriter::ColumnFragmentWriter( false, // has_timestamps false); // has_delete_meta - // Initialize metadata with domain and tile count + // Initialize metadata with domain frag_meta_->init(non_empty_domain); - frag_meta_->set_num_tiles(tile_num_); + + // For dense or sparse with known tile count, pre-allocate metadata + if (tile_num_ > 0) { + frag_meta_->set_num_tiles(tile_num_); + } } /* ********************************* */ @@ -127,7 +133,13 @@ void ColumnFragmentWriter::write_tile(WriterTileTuple& tile) { "Cannot write tile: tile is not filtered"); } - if (current_tile_idx_ >= tile_num_) { + // For sparse arrays with dynamic growth (tile_num_=0 initially, first field), + // grow metadata using doubling strategy. + // After first field closes, tile_num_ is fixed. + if (!dense_ && !first_field_closed_ && current_tile_idx_ >= tile_num_) { + tile_num_ = (tile_num_ == 0) ? 1 : tile_num_ * 2; + frag_meta_->set_num_tiles(tile_num_); + } else if (current_tile_idx_ >= tile_num_) { throw ColumnFragmentWriterException( "Cannot write tile: tile count limit (" + std::to_string(tile_num_) + ") reached"); @@ -237,8 +249,13 @@ void ColumnFragmentWriter::close_field() { throw_if_not_ok(resources_->vfs().close_file(validity_uri)); } - // Verify correct number of tiles were written - if (current_tile_idx_ != tile_num_) { + // For sparse with dynamic growth, first closed field determines tile count. + // Resize to actual count (doubling may have over-allocated). + if (!dense_ && !first_field_closed_) { + tile_num_ = current_tile_idx_; + frag_meta_->set_num_tiles(tile_num_); + first_field_closed_ = true; + } else if (current_tile_idx_ != tile_num_) { throw ColumnFragmentWriterException( "Field '" + name + "' has " + std::to_string(current_tile_idx_) + " tiles but expected " + std::to_string(tile_num_)); diff --git a/tiledb/sm/fragment/column_fragment_writer.h b/tiledb/sm/fragment/column_fragment_writer.h index f76e2045ca9..c9716364e64 100644 --- a/tiledb/sm/fragment/column_fragment_writer.h +++ b/tiledb/sm/fragment/column_fragment_writer.h @@ -88,7 +88,9 @@ class ColumnFragmentWriter { * @param array_schema The schema of the array the fragment belongs to. * @param fragment_uri The fragment URI (must contain valid timestamps). * @param non_empty_domain The non-empty domain for this fragment. - * @param tile_count Number of tiles (required for sparse, ignored for dense). + * @param tile_count Number of tiles for sparse arrays. If 0, tile count is + * determined dynamically by the first field written (for streaming). + * Ignored for dense arrays (computed from domain). */ ColumnFragmentWriter( ContextResources* resources, @@ -204,6 +206,10 @@ class ColumnFragmentWriter { /** Number of tiles to be written. */ uint64_t tile_num_; + + /** Whether the first field has been closed (for sparse dynamic tile count). + */ + bool first_field_closed_; }; } // namespace tiledb::sm From f7711506f4b23d99365111fe161bad70b33cd190 Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Wed, 21 Jan 2026 23:56:21 +0200 Subject: [PATCH 7/9] Update test --- test/src/unit-column-fragment-writer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc index 730d0769e70..bef59fced23 100644 --- a/test/src/unit-column-fragment-writer.cc +++ b/test/src/unit-column-fragment-writer.cc @@ -624,13 +624,13 @@ TEST_CASE_METHOD( non_empty_domain.emplace_back( Range(&domain_start, &domain_end, sizeof(int32_t))); - // Create writer for sparse array + // Create writer for sparse array (tile_count=0 for dynamic growth) ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain, - 1); // 1 tile + 0); // dynamic tile count // Prepare MBRs for the tile std::vector mbrs; From b842c38a9be97dac17098c48b9031f3bf433d4eb Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Thu, 22 Jan 2026 16:18:01 +0200 Subject: [PATCH 8/9] Fix attempt --- tiledb/sm/fragment/column_fragment_writer.cc | 2 +- tiledb/sm/tile/writer_tile_tuple.cc | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tiledb/sm/fragment/column_fragment_writer.cc b/tiledb/sm/fragment/column_fragment_writer.cc index 72a086a42f2..d5c6268b2f5 100644 --- a/tiledb/sm/fragment/column_fragment_writer.cc +++ b/tiledb/sm/fragment/column_fragment_writer.cc @@ -191,7 +191,7 @@ void ColumnFragmentWriter::write_tile(WriterTileTuple& tile) { name, current_tile_idx_, tile.max().size()); } } else { - if (has_min_max_md && null_count != cell_num) { + if (has_min_max_md && null_count != cell_num && !tile.min().empty()) { frag_meta_->set_tile_min(name, current_tile_idx_, tile.min()); frag_meta_->set_tile_max(name, current_tile_idx_, tile.max()); } diff --git a/tiledb/sm/tile/writer_tile_tuple.cc b/tiledb/sm/tile/writer_tile_tuple.cc index e6e823ec153..5533045ab5f 100644 --- a/tiledb/sm/tile/writer_tile_tuple.cc +++ b/tiledb/sm/tile/writer_tile_tuple.cc @@ -115,6 +115,17 @@ void WriterTileTuple::set_metadata( } std::optional WriterTileTuple::filtered_size() const { + // Check if the main tile is filtered. If not, return nullopt. + if (var_size()) { + if (!offset_tile().filtered()) { + return std::nullopt; + } + } else { + if (!fixed_tile().filtered()) { + return std::nullopt; + } + } + uint64_t tile_size = 0; if (var_size()) { tile_size += offset_tile().filtered_buffer().size(); From cbfd295145f43cc8446db9f4d3e28da1f739a2dc Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis Date: Fri, 30 Jan 2026 15:42:36 +0200 Subject: [PATCH 9/9] Comments --- test/src/unit-column-fragment-writer.cc | 44 +++++++++++------- tiledb/sm/fragment/column_fragment_writer.cc | 49 ++++++++++---------- tiledb/sm/fragment/column_fragment_writer.h | 38 +++++++++------ 3 files changed, 75 insertions(+), 56 deletions(-) diff --git a/test/src/unit-column-fragment-writer.cc b/test/src/unit-column-fragment-writer.cc index bef59fced23..f10b55f82cf 100644 --- a/test/src/unit-column-fragment-writer.cc +++ b/test/src/unit-column-fragment-writer.cc @@ -357,7 +357,9 @@ TEST_CASE_METHOD( memory_tracker); writer.open_field("a"); - REQUIRE_THROWS(writer.write_tile(tile)); + REQUIRE_THROWS_WITH( + writer.write_tile(tile), + Catch::Matchers::ContainsSubstring("tile is not filtered")); } TEST_CASE_METHOD( @@ -408,8 +410,7 @@ TEST_CASE_METHOD( EncryptionKey enc_key; REQUIRE_THROWS_WITH( writer.finalize(enc_key), - Catch::Matchers::ContainsSubstring( - "Cannot finalize sparse array without MBRs")); + Catch::Matchers::ContainsSubstring("Call set_mbrs() first")); } TEST_CASE_METHOD( @@ -431,16 +432,15 @@ TEST_CASE_METHOD( ColumnFragmentWriter writer( &get_resources(), array_schema, fragment_uri, non_empty_domain); - // No need to write tiles - just test that finalize with MBRs throws - EncryptionKey enc_key; + // No need to write tiles - just test that set_mbrs throws for dense std::vector mbrs; REQUIRE_THROWS_WITH( - writer.finalize(enc_key, mbrs), + writer.set_mbrs(std::move(mbrs)), Catch::Matchers::ContainsSubstring( "Dense arrays should not provide MBRs")); } - SECTION("Sparse array must use finalize with MBRs") { + SECTION("Sparse array must call set_mbrs before finalize") { create_sparse_array(); auto array_schema = get_array_schema(); @@ -461,8 +461,7 @@ TEST_CASE_METHOD( EncryptionKey enc_key; REQUIRE_THROWS_WITH( writer.finalize(enc_key), - Catch::Matchers::ContainsSubstring( - "Cannot finalize sparse array without MBRs")); + Catch::Matchers::ContainsSubstring("Call set_mbrs() first")); } } @@ -513,7 +512,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( ColumnFragmentWriterFx, - "ColumnFragmentWriter: write and read roundtrip", + "ColumnFragmentWriter: write and read roundtrip dense array one tile", "[column-fragment-writer]") { create_dense_array(); @@ -668,6 +667,9 @@ TEST_CASE_METHOD( writer.close_field(); } + // Set MBRs after processing dimensions (allows freeing intermediate memory) + writer.set_mbrs(std::move(mbrs)); + // Write attribute "a" { writer.open_field("a"); @@ -696,8 +698,8 @@ TEST_CASE_METHOD( writer.close_field(); } - // Finalize sparse fragment with MBRs - writer.finalize(enc_key, mbrs); + // Finalize sparse fragment + writer.finalize(enc_key); // Read back and verify using standard API { @@ -790,6 +792,11 @@ TEST_CASE_METHOD( writer.close_field(); } + // Set MBRs after processing dimensions + std::vector mbrs(1); + mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); + writer.set_mbrs(std::move(mbrs)); + // Write var-size attribute { writer.open_field("a"); @@ -837,9 +844,7 @@ TEST_CASE_METHOD( writer.close_field(); } - std::vector mbrs(1); - mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); - writer.finalize(enc_key, mbrs); + writer.finalize(enc_key); // Read back and verify { @@ -919,6 +924,11 @@ TEST_CASE_METHOD( writer.close_field(); } + // Set MBRs after processing dimensions + std::vector mbrs(1); + mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); + writer.set_mbrs(std::move(mbrs)); + // Write nullable attribute (values at odd indices are null) { writer.open_field("a"); @@ -949,9 +959,7 @@ TEST_CASE_METHOD( writer.close_field(); } - std::vector mbrs(1); - mbrs[0].emplace_back(Range(&domain_start, &domain_end, sizeof(int32_t))); - writer.finalize(enc_key, mbrs); + writer.finalize(enc_key); // Read back and verify { diff --git a/tiledb/sm/fragment/column_fragment_writer.cc b/tiledb/sm/fragment/column_fragment_writer.cc index d5c6268b2f5..0679746447d 100644 --- a/tiledb/sm/fragment/column_fragment_writer.cc +++ b/tiledb/sm/fragment/column_fragment_writer.cc @@ -62,7 +62,8 @@ ColumnFragmentWriter::ColumnFragmentWriter( , dense_(array_schema->dense()) , current_tile_idx_(0) , tile_num_(0) - , first_field_closed_(false) { + , first_field_closed_(false) + , mbrs_set_(false) { // For dense arrays, compute tile count from domain. // For sparse arrays, use provided tile_count (0 means dynamic growth). if (dense_) { @@ -265,43 +266,43 @@ void ColumnFragmentWriter::close_field() { current_tile_idx_ = 0; } -/* ********************************* */ -/* FRAGMENT-LEVEL OPERATIONS */ -/* ********************************* */ +void ColumnFragmentWriter::set_mbrs(std::vector&& mbrs) { + if (dense_) { + throw ColumnFragmentWriterException("Dense arrays should not provide MBRs"); + } -void ColumnFragmentWriter::finalize(const EncryptionKey& encryption_key) { - if (!current_field_.empty()) { + if (mbrs.size() != tile_num_) { throw ColumnFragmentWriterException( - "Cannot finalize: field '" + current_field_ + "' is still open"); + "Sparse array requires " + std::to_string(tile_num_) + + " MBRs but got " + std::to_string(mbrs.size())); } - if (!dense_) { - throw ColumnFragmentWriterException( - "Cannot finalize sparse array without MBRs"); + mbrs_ = std::move(mbrs); + mbrs_set_ = true; + + // Set MBRs in fragment metadata immediately so memory can be managed + for (uint64_t i = 0; i < mbrs_.size(); ++i) { + frag_meta_->set_mbr(i, mbrs_[i]); } - finalize_internal(encryption_key); + // Clear local storage since metadata now owns the MBRs + mbrs_.clear(); + mbrs_.shrink_to_fit(); } -void ColumnFragmentWriter::finalize( - const EncryptionKey& encryption_key, const std::vector& mbrs) { +/* ********************************* */ +/* FRAGMENT-LEVEL OPERATIONS */ +/* ********************************* */ + +void ColumnFragmentWriter::finalize(const EncryptionKey& encryption_key) { if (!current_field_.empty()) { throw ColumnFragmentWriterException( "Cannot finalize: field '" + current_field_ + "' is still open"); } - if (dense_) { - throw ColumnFragmentWriterException("Dense arrays should not provide MBRs"); - } - - if (mbrs.size() != tile_num_) { + if (!dense_ && !mbrs_set_) { throw ColumnFragmentWriterException( - "Sparse array requires " + std::to_string(tile_num_) + - " MBRs but got " + std::to_string(mbrs.size())); - } - - for (uint64_t i = 0; i < mbrs.size(); ++i) { - frag_meta_->set_mbr(i, mbrs[i]); + "Cannot finalize sparse array without MBRs. Call set_mbrs() first."); } finalize_internal(encryption_key); diff --git a/tiledb/sm/fragment/column_fragment_writer.h b/tiledb/sm/fragment/column_fragment_writer.h index c9716364e64..15fb2ee1ad1 100644 --- a/tiledb/sm/fragment/column_fragment_writer.h +++ b/tiledb/sm/fragment/column_fragment_writer.h @@ -67,7 +67,8 @@ class ColumnFragmentWriterException : public StatusException { * a. Call open_field(name) * b. Call write_tile() for each pre-filtered tile * c. Call close_field() - * 3. Call finalize(key) for dense or finalize(key, mbrs) for sparse + * 3. For sparse arrays, call set_mbrs() after processing dimensions + * 4. Call finalize(key) */ class ColumnFragmentWriter { public: @@ -134,29 +135,32 @@ class ColumnFragmentWriter { */ void close_field(); + /** + * Sets the MBRs for a sparse fragment. Should be called after processing + * dimensions and before finalize(). This allows freeing intermediate MBR + * computation memory before processing attributes. + * + * @param mbrs MBRs for sparse arrays (one per tile). Ownership is + * transferred. + * @throws ColumnFragmentWriterException if this is a dense array, + * or if MBR count doesn't match tile count. + */ + void set_mbrs(std::vector&& mbrs); + /* ********************************* */ /* FRAGMENT-LEVEL OPERATIONS */ /* ********************************* */ /** - * Finalizes a dense fragment. Stores metadata and creates commit file. + * Finalizes the fragment. Stores metadata and creates commit file. * - * @param encryption_key The encryption key for storing metadata. - * @throws ColumnFragmentWriterException if a field is still open, - * or if this is a sparse array. - */ - void finalize(const EncryptionKey& encryption_key); - - /** - * Finalizes a sparse fragment. Stores metadata and creates commit file. + * For sparse arrays, set_mbrs() must be called before finalize(). * * @param encryption_key The encryption key for storing metadata. - * @param mbrs MBRs for sparse arrays (one per tile). * @throws ColumnFragmentWriterException if a field is still open, - * if this is a dense array, or if MBR count doesn't match tile count. + * or if this is a sparse array without MBRs set. */ - void finalize( - const EncryptionKey& encryption_key, const std::vector& mbrs); + void finalize(const EncryptionKey& encryption_key); /* ********************************* */ /* ACCESSORS */ @@ -210,6 +214,12 @@ class ColumnFragmentWriter { /** Whether the first field has been closed (for sparse dynamic tile count). */ bool first_field_closed_; + + /** MBRs for sparse arrays (set via set_mbrs()). */ + std::vector mbrs_; + + /** Whether MBRs have been set. */ + bool mbrs_set_; }; } // namespace tiledb::sm