diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ec58ac727e5..6d04278eb25 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -16,6 +16,7 @@
 # under the License.
 
 from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
+from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode
 from collections.abc import Sequence
 import os
 
@@ -65,6 +66,30 @@ def _ndarray_to_arrow_type(object values, DataType type):
     return pyarrow_wrap_data_type(_ndarray_to_type(values, type))
 
 
+cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype):
+    if string_dtype is True:  # identity check, so e.g. 1 is not accepted
+        return StringConversionMode_STRING_DTYPE
+    if string_dtype is False:
+        return StringConversionMode_PYTHON_OBJECT
+
+    if string_dtype is None:  # None behaves like the default "auto"
+        return StringConversionMode_PYTHON_OBJECT
+
+    if isinstance(string_dtype, str):
+        option = string_dtype.lower()  # aliases are matched case-insensitively
+        if option == "auto":  # "auto" currently keeps the object-dtype result
+            return StringConversionMode_PYTHON_OBJECT
+        if option in ("numpy", "string", "stringdtype"):
+            return StringConversionMode_STRING_DTYPE
+        if option in ("python", "object"):
+            return StringConversionMode_PYTHON_OBJECT
+
+    raise ValueError(
+        "string_dtype must be one of 'auto', 'numpy', 'string', 'stringdtype', "
+        f"'python', 'object', True, False or None; got {string_dtype!r}"
+    )
+
+
 cdef shared_ptr[CDataType] _ndarray_to_type(object values,
                                             DataType type) except *:
     cdef shared_ptr[CDataType] c_type
@@ -1734,7 +1759,7 @@ cdef class Array(_PandasConvertible):
             return values
         return np.asarray(values, dtype=dtype)
 
-    def to_numpy(self, zero_copy_only=True, writable=False):
+    def to_numpy(self, zero_copy_only=True, writable=False, *, string_dtype="auto"):
         """
         Return a NumPy view or copy of this array.
 
@@ -1757,6 +1782,14 @@ cdef class Array(_PandasConvertible):
             By setting this to True, a copy of the array is made to ensure
             it is writable.
 
+        string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto"
+            Controls how string-like arrays are converted when NumPy 2.0's
+            :class:`~numpy.dtypes.StringDType` is available. ``"numpy"`` or
+            ``True`` will request StringDType (copying), ``"python"``/``"object"``
+            or ``False`` will force Python object dtype. ``"auto"`` preserves the
+            default object dtype unless StringDType is explicitly requested.
+            Converting to NumPy's StringDType always copies string data.
+
         Returns
         -------
         array : numpy.ndarray
@@ -1775,6 +1808,11 @@ cdef class Array(_PandasConvertible):
             raise ValueError(
                 "Cannot return a writable array if asking for zero-copy")
 
+        c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype)
+        if c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE:
+            if not HasNumPyStringDType():
+                raise NotImplementedError("NumPy StringDType not available")
+
         # If there are nulls and the array is a DictionaryArray
         # decoding the dictionary will make sure nulls are correctly handled.
# Decoding a dictionary does imply a copy by the way, diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 4724c52ccb5..a1cb237ad7c 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -161,6 +161,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[CTable] table, PyObject** out) + c_bool HasNumPyStringDType() + void c_set_default_memory_pool \ " arrow::py::set_default_memory_pool"(CMemoryPool* pool)\ @@ -182,6 +184,11 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: cdef cppclass PyOutputStream(COutputStream): PyOutputStream(object fo) + cdef enum StringConversionMode "arrow::py::PandasOptions::StringConversionMode": + StringConversionMode_AUTO "arrow::py::PandasOptions::StringConversionMode::AUTO" + StringConversionMode_STRING_DTYPE "arrow::py::PandasOptions::StringConversionMode::STRING_DTYPE" + StringConversionMode_PYTHON_OBJECT "arrow::py::PandasOptions::StringConversionMode::PYTHON_OBJECT" + cdef cppclass PandasOptions: CMemoryPool* pool c_bool strings_to_categorical @@ -201,6 +208,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: shared_ptr[const unordered_set[c_string]] categorical_columns shared_ptr[const unordered_set[c_string]] extension_columns c_bool to_numpy + StringConversionMode string_conversion_mode cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil: diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index f163266f3b8..4e699381b65 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -32,12 +32,14 @@ #include #include "arrow/array.h" +#include "arrow/array/array_binary.h" #include "arrow/buffer.h" #include "arrow/datum.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" #include 
"arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/int_util.h" @@ -68,6 +70,16 @@ using internal::CheckIndexBounds; using internal::OptionalParallelFor; namespace py { + +ARROW_PYTHON_EXPORT bool HasNumPyStringDType() { +#if NPY_ABI_VERSION >= 0x02000000 + auto* dtype_table = reinterpret_cast(PyArray_API + 320); + return dtype_table[39] != nullptr; +#else + return false; +#endif +} + namespace { // Fix options for conversion of an inner (child) array. @@ -344,6 +356,7 @@ class PandasWriter { public: enum type { OBJECT, + STRING_DTYPE, UINT8, INT8, UINT16, @@ -1405,6 +1418,241 @@ class ObjectWriter : public TypedPandasWriter { } }; +#if NPY_ABI_VERSION >= 0x02000000 +inline npy_string_allocator* ArrowNpyString_acquire_allocator( + const PyArray_StringDTypeObject* descr) { + using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); + return reinterpret_cast(PyArray_API[316])(descr); +} + +inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { + using Func = void (*)(npy_string_allocator*); + reinterpret_cast(PyArray_API[318])(allocator); +} + +inline int ArrowNpyString_pack(npy_string_allocator* allocator, + npy_packed_static_string* packed, const char* data, + size_t length) { + using Func = + int (*)(npy_string_allocator*, npy_packed_static_string*, const char*, size_t); + return reinterpret_cast(PyArray_API[314])(allocator, packed, data, length); +} + +inline int ArrowNpyString_pack_null(npy_string_allocator* allocator, + npy_packed_static_string* packed) { + using Func = int (*)(npy_string_allocator*, npy_packed_static_string*); + return reinterpret_cast(PyArray_API[315])(allocator, packed); +} + +Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string* packed, + const std::string_view& view) { + const int result = ArrowNpyString_pack(allocator, packed, view.data(), view.size()); + if (result == -1) 
{ + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to pack NumPy StringDType value"); + } + return Status::OK(); +} + +Status PackNullString(npy_string_allocator* allocator, npy_packed_static_string* packed) { + const int result = ArrowNpyString_pack_null(allocator, packed); + if (result == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to pack NumPy StringDType value"); + } + return Status::OK(); +} + +template +Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* allocator, + char* data, npy_intp stride) { + using offset_type = typename ArrayType::offset_type; + + const offset_type* offsets = arr.raw_value_offsets(); + const auto base_offset = offsets[0]; + const uint8_t* value_data = arr.value_data()->data(); + const uint8_t* validity = arr.null_bitmap_data(); + + auto pack_values = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + const auto start = static_cast(offsets[position + i] - base_offset); + const auto end = static_cast(offsets[position + i + 1] - base_offset); + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackStringValue( + allocator, packed, + std::string_view(reinterpret_cast(value_data + start), + end - start))); + } + return Status::OK(); + }; + + auto pack_nulls = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackNullString(allocator, packed)); + } + return Status::OK(); + }; + + if (arr.null_count() == 0) { + return pack_values(/*position=*/0, arr.length()); + } + + if (validity == nullptr) { + for (int64_t i = 0; i < arr.length(); ++i) { + auto* packed = reinterpret_cast(data + i * stride); + if (arr.IsNull(i)) { + RETURN_NOT_OK(PackNullString(allocator, packed)); + } else { + const auto start = static_cast(offsets[i] - base_offset); + const auto end = static_cast(offsets[i + 1] - 
base_offset); + RETURN_NOT_OK(PackStringValue( + allocator, packed, + std::string_view(reinterpret_cast(value_data + start), + end - start))); + } + } + } else { + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(position, run.length)); + } else { + RETURN_NOT_OK(pack_nulls(position, run.length)); + } + position += run.length; + run = reader.NextRun(); + } + } + + return Status::OK(); +} + +template +Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocator, + char* data, npy_intp stride) { + const uint8_t* validity = arr.null_bitmap_data(); + + auto pack_values = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + const auto view = arr.GetView(position + i); + RETURN_NOT_OK(PackStringValue(allocator, packed, view)); + } + return Status::OK(); + }; + + auto pack_nulls = [&](int64_t position, int64_t length) -> Status { + for (int64_t i = 0; i < length; ++i) { + auto* packed = + reinterpret_cast(data + (position + i) * stride); + RETURN_NOT_OK(PackNullString(allocator, packed)); + } + return Status::OK(); + }; + + if (arr.null_count() == 0) { + return pack_values(/*position=*/0, arr.length()); + } + + if (validity == nullptr) { + for (int64_t i = 0; i < arr.length(); ++i) { + auto* packed = reinterpret_cast(data + i * stride); + if (arr.IsNull(i)) { + RETURN_NOT_OK(PackNullString(allocator, packed)); + } else { + const auto view = arr.GetView(i); + RETURN_NOT_OK(PackStringValue(allocator, packed, view)); + } + } + } else { + arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length()); + int64_t position = 0; + auto run = reader.NextRun(); + while (run.length > 0) { + if (run.set) { + RETURN_NOT_OK(pack_values(position, run.length)); + } else { + 
RETURN_NOT_OK(pack_nulls(position, run.length)); + } + position += run.length; + run = reader.NextRun(); + } + } + + return Status::OK(); +} + +class StringDTypeWriter : public PandasWriter { + public: + using PandasWriter::PandasWriter; + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + ARROW_UNUSED(py_ref); + RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); + RETURN_NOT_OK(EnsureAllocated()); + return CopyInto(std::move(data), /*rel_placement=*/0); + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); + + PyAcquireGIL lock; + auto* np_arr = reinterpret_cast(block_arr_.obj()); + auto* descr = reinterpret_cast(PyArray_DESCR(np_arr)); + + npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr); + if (allocator == nullptr) { + return Status::Invalid("Failed to acquire NumPy StringDType allocator"); + } + struct AllocatorGuard { + npy_string_allocator* allocator; + explicit AllocatorGuard(npy_string_allocator* alloc) : allocator(alloc) {} + ~AllocatorGuard() { ArrowNpyString_release_allocator(allocator); } + } guard(allocator); + + const npy_intp row_stride = PyArray_STRIDES(np_arr)[1]; + char* data_start = PyArray_BYTES(np_arr) + rel_placement * PyArray_STRIDES(np_arr)[0]; + int64_t offset = 0; + + for (const auto& chunk : data->chunks()) { + char* chunk_data = data_start + offset * row_stride; + switch (data->type()->id()) { + case Type::STRING: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + case Type::LARGE_STRING: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + case Type::STRING_VIEW: { + const auto& arr = checked_cast(*chunk); + RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride)); + break; + } + default: + return Status::TypeError("Expected an 
Arrow string array, got ", + data->type()->ToString()); + } + offset += chunk->length(); + } + + return Status::OK(); + } + + protected: + Status Allocate() override { return AllocateNDArray(NPY_VSTRING); } +}; +#endif + static inline bool IsNonNullContiguous(const ChunkedArray& data) { return data.num_chunks() == 1 && data.null_count() == 0; } @@ -2056,6 +2304,11 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, case PandasWriter::EXTENSION: *writer = std::make_shared(options, num_rows, num_columns); break; +#if NPY_ABI_VERSION >= 0x02000000 + case PandasWriter::STRING_DTYPE: + *writer = std::make_shared(options, num_rows, num_columns); + break; +#endif BLOCK_CASE(OBJECT, ObjectWriter); BLOCK_CASE(UINT8, UInt8Writer); BLOCK_CASE(INT8, Int8Writer); @@ -2130,10 +2383,22 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::DOUBLE: *output_type = PandasWriter::DOUBLE; break; - case Type::STRING: // fall through - case Type::LARGE_STRING: // fall through - case Type::STRING_VIEW: // fall through - case Type::BINARY: // fall through + case Type::STRING: // fall through + case Type::LARGE_STRING: // fall through + case Type::STRING_VIEW: { // fall through +#if NPY_ABI_VERSION >= 0x02000000 + if (options.to_numpy && options.string_conversion_mode == + PandasOptions::StringConversionMode::STRING_DTYPE) { + // NumPy's StringDType allocator always copies string data, so zero-copy + // requests must continue to route through the object-dtype path. 
+        *output_type = PandasWriter::STRING_DTYPE;
+        break;
+      }
+#endif
+      *output_type = PandasWriter::OBJECT;
+      break;
+    }
+    case Type::BINARY:  // fall through
     case Type::LARGE_BINARY:
     case Type::BINARY_VIEW:
     case Type::NA:  // fall through
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index b4e91e6cf5a..ce45f4f3456 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -140,6 +140,12 @@ struct PandasOptions {
   // Used internally to decipher between to_numpy() and to_pandas() when
   // the expected output differs
   bool to_numpy = false;
+
+  enum class StringConversionMode { AUTO, STRING_DTYPE, PYTHON_OBJECT };
+
+  // How string-like Arrow arrays are converted by Array.to_numpy /
+  // ChunkedArray.to_numpy. NOTE(review): AUTO looks unused so far - confirm.
+  StringConversionMode string_conversion_mode = StringConversionMode::PYTHON_OBJECT;
 };
 
 ARROW_PYTHON_EXPORT
@@ -161,5 +167,7 @@ ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr table,
                             PyObject** out);
 
+ARROW_PYTHON_EXPORT bool HasNumPyStringDType();
+
 }  // namespace py
 }  // namespace arrow
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 4113cc67d2f..facad8adfc8 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -37,6 +37,10 @@
 namespace arrow {
 namespace py {
 
+#ifndef NPY_VSTRING
+#  define NPY_VSTRING 2056  // NOTE(review): confirm against NumPy 2 headers
+#endif
+
 NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
   PyAcquireGIL lock;
   arr_ = ao;
@@ -122,6 +126,10 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) {
   return NumPyDtypeToArrow(descr);
 }
 
+bool IsStringDType(PyArray_Descr* descr) {
+  return descr != nullptr && descr->type_num == NPY_VSTRING;
+}
+
 Result> NumPyDtypeToArrow(PyObject* dtype) {
   if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
     return Status::TypeError("Did not pass numpy.dtype object");
@@ -133,6 +141,10 @@ Result> NumPyDtypeToArrow(PyObject* dtype) {
 Result> NumPyDtypeToArrow(PyArray_Descr* descr) {
   int type_num = fix_numpy_type_num(descr->type_num);
 
+  if (IsStringDType(descr)) {
+    return utf8();
+  }
+
   switch (type_num) {
     TO_ARROW_TYPE_CASE(BOOL, boolean);
     TO_ARROW_TYPE_CASE(INT8, int8);
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h b/python/pyarrow/src/arrow/python/numpy_convert.h
index 2d1086e1355..cac389d17a1 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.h
+++ b/python/pyarrow/src/arrow/python/numpy_convert.h
@@ -55,6 +55,8 @@ Result> NumPyDtypeToArrow(PyArray_Descr* descr);
 ARROW_PYTHON_EXPORT
 Result> NumPyScalarToArrowDataType(PyObject* scalar);
 
+ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr);
+
 ARROW_PYTHON_EXPORT
 Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
                        const std::vector& dim_names, std::shared_ptr* out);
diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
index 5647e895d0f..dfbdd25a026 100644
--- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -59,6 +60,8 @@
 #include "arrow/python/type_traits.h"
 #include "arrow/python/vendored/pythoncapi_compat.h"
 
+#include
+
 namespace arrow {
 
 using internal::checked_cast;
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize;
 
 namespace {
 
+#if NPY_ABI_VERSION >= 0x02000000
+inline npy_string_allocator* ArrowNpyString_acquire_allocator(
+    const PyArray_StringDTypeObject* descr) {
+  using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
+  return reinterpret_cast(PyArray_API[316])(descr);
+}
+
+inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
+  using Func = void (*)(npy_string_allocator*);
+  reinterpret_cast(PyArray_API[318])(allocator);
+}
+
+inline int ArrowNpyString_load(npy_string_allocator* allocator,
+                               const npy_packed_static_string* packed,
+                               npy_static_string* out) {
+  using Func =
+      int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*);
+  return reinterpret_cast(PyArray_API[313])(allocator, packed, out);
+}
+#endif
+
 Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                           std::shared_ptr* out) {
   int64_t null_bytes = bit_util::BytesForBits(length);
@@ -233,6 +257,13 @@ class NumPyConverter {
   Status Visit(const LargeStringType& type);
   Status Visit(const StringViewType& type);
 
+#if NPY_ABI_VERSION >= 0x02000000
+  template
+  Status AppendStringDTypeValues(Builder* builder);
+
+  Status ConvertStringDType();
+#endif
+
   Status Visit(const StructType& type);
 
   Status Visit(const FixedSizeBinaryType& type);
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
     return Status::OK();
   }
 
+  if (IsStringDType(dtype_)) {
+#if NPY_ABI_VERSION >= 0x02000000
+    RETURN_NOT_OK(ConvertStringDType());
+    return Status::OK();
+#else
+    return Status::NotImplemented(
+        "NumPy StringDType requires building PyArrow with NumPy >= 2.0");
+#endif
+  }
+
   if (type_ == nullptr) {
     return Status::Invalid("Must pass data type for non-object arrays");
   }
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
   return Status::OK();
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+template
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
+  auto* descr = reinterpret_cast(dtype_);
+
+  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+  if (allocator == nullptr) {
+    return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+  }
+
+  struct AllocatorGuard {
+    npy_string_allocator* ptr;
+    explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+    ~AllocatorGuard() {
+      if (ptr != nullptr) {
+        ArrowNpyString_release_allocator(ptr);
+      }
+    }
+  } guard(allocator);
+
+  npy_static_string value = {0, nullptr};
+  char* data = PyArray_BYTES(arr_);
+
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer mask_values(mask_);
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {  // an explicit mask entry wins over the stored value
+        RETURN_NOT_OK(builder->AppendNull());
+        continue;
+      }
+
+      const auto* packed =
+          reinterpret_cast(data + i * stride_);
+      const int is_null = ArrowNpyString_load(allocator, packed, &value);
+      if (is_null == -1) {
+        RETURN_IF_PYERROR();
+        return Status::Invalid("Failed to unpack NumPy StringDType value");
+      }
+      if (is_null) {
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+      }
+    }
+    return Status::OK();
+  }
+
+  for (int64_t i = 0; i < length_; ++i) {
+    const auto* packed = reinterpret_cast(data);
+    const int is_null = ArrowNpyString_load(allocator, packed, &value);
+    if (is_null == -1) {
+      RETURN_IF_PYERROR();
+      return Status::Invalid("Failed to unpack NumPy StringDType value");
+    }
+    if (is_null) {
+      RETURN_NOT_OK(builder->AppendNull());
+    } else {
+      RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+    }
+    data += stride_;
+  }
+
+  return Status::OK();
+}
+
+Status NumPyConverter::ConvertStringDType() {
+  util::InitializeUTF8();
+
+  if (type_ == nullptr) {  // no target type requested: default to utf8
+    type_ = utf8();
+  }
+
+  switch (type_->id()) {
+    case Type::STRING: {
+      arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+
+      ArrayVector chunks;
+      RETURN_NOT_OK(builder.Finish(&chunks));
+      for (const auto& chunk : chunks) {
+        RETURN_NOT_OK(PushArray(chunk->data()));
+      }
+      return Status::OK();
+    }
+    case Type::LARGE_STRING: {
+      LargeStringBuilder builder(pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+      return PushBuilderResult(&builder);
+    }
+    case Type::STRING_VIEW: {
+      StringViewBuilder builder(pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+      return PushBuilderResult(&builder);
+    }
+    default:
+      return Status::TypeError(
+          "NumPy StringDType can only be converted to Arrow string types");
+  }
+}
+#endif
+
 Status NumPyConverter::Visit(const StructType& type) {
   std::vector sub_converters;
   std::vector sub_arrays;
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9136f252980..a2bd1edd114 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -17,10 +17,35 @@
 # under the License.
 
 from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
+from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode
 
 import warnings
 from cython import sizeof
 
+
+cdef inline StringConversionMode _resolve_table_string_conversion_mode(object string_dtype):
+    if string_dtype is True:  # identity check, so e.g. 1 is not accepted
+        return StringConversionMode_STRING_DTYPE
+    if string_dtype is False:
+        return StringConversionMode_PYTHON_OBJECT
+
+    if string_dtype is None:  # None behaves like the default "auto"
+        return StringConversionMode_PYTHON_OBJECT
+
+    if isinstance(string_dtype, str):
+        option = string_dtype.lower()  # aliases are matched case-insensitively
+        if option == "auto":  # "auto" currently keeps the object-dtype result
+            return StringConversionMode_PYTHON_OBJECT
+        if option in ("numpy", "string", "stringdtype"):
+            return StringConversionMode_STRING_DTYPE
+        if option in ("python", "object"):
+            return StringConversionMode_PYTHON_OBJECT
+
+    raise ValueError(
+        "string_dtype must be one of 'auto', 'numpy', 'string', 'stringdtype', "
+        f"'python', 'object', True, False or None; got {string_dtype!r}"
+    )
+
 cdef class ChunkedArray(_PandasConvertible):
     """
     An array-like composed from a (possibly empty) collection of pyarrow.Arrays
@@ -491,7 +516,7 @@ cdef class ChunkedArray(_PandasConvertible):
         self._assert_cpu()
         return _array_like_to_pandas(self, options, types_mapper=types_mapper)
 
-    def to_numpy(self, zero_copy_only=False):
+    def to_numpy(self, zero_copy_only=False, *, string_dtype="auto"):
         """
         Return a NumPy copy of this array (experimental).
 
@@ -500,6 +525,13 @@ cdef class ChunkedArray(_PandasConvertible):
         zero_copy_only : bool, default False
             Introduced for signature consistence with pyarrow.Array.to_numpy.
             This must be False here since NumPy arrays' buffer must be contiguous.
+        string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto"
+            Controls how string-like arrays are converted when NumPy 2.0's
+            :class:`~numpy.dtypes.StringDType` is available. ``"numpy"`` or
+            ``True`` will request StringDType (copying), ``"python"``/``"object"``
+            or ``False`` will force Python object dtype. ``"auto"`` preserves the
+            default object dtype unless StringDType is explicitly requested.
+            Converting to NumPy's StringDType always copies string data.
 
         Returns
         -------
@@ -526,6 +558,11 @@ cdef class ChunkedArray(_PandasConvertible):
             object values
 
         c_options.to_numpy = True
+        c_options.string_conversion_mode = _resolve_table_string_conversion_mode(
+            string_dtype)
+        if c_options.string_conversion_mode == StringConversionMode_STRING_DTYPE:
+            if not HasNumPyStringDType():
+                raise NotImplementedError("NumPy StringDType not available")
 
         with nogil:
             check_status(
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ec361159c5f..7344b9b5b5d 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2331,6 +2331,38 @@ def test_to_numpy_roundtrip():
     np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())
 
 
+@pytest.mark.numpy
+@pytest.mark.parametrize(
+    "arrow_type",
+    [pa.string(), pa.large_string(), pa.string_view()],
+)
+@pytest.mark.parametrize("scenario", ["no_nulls", "with_nulls", "sliced", "empty"])
+def test_to_numpy_stringdtype(arrow_type, scenario):
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    values = {
+        "no_nulls": ["a", "b", "c"],
+        "with_nulls": ["a", None, "c"],
+        "sliced": ["z", "a", None, "c", "q"],  # slice(1, 3) below drops "z" and "q"
+        "empty": [],
+    }
+
+    arr = pa.array(values[scenario], type=arrow_type)
+    if scenario == "sliced":
+        arr = arr.slice(1, 3)
+
+    result = arr.to_numpy(zero_copy_only=False, string_dtype="numpy")
+
+    assert result.dtype == np.dtype(StringDType())
+    assert result.tolist() == arr.to_pylist()  # NOTE(review): confirm nulls -> None
+
+
 @pytest.mark.numpy
 def test_array_uint64_from_py_over_range():
     arr = pa.array([2 ** 63], type=pa.uint64())
@@ -2758,6 +2790,119 @@ def test_array_from_numpy_unicode(string_type):
     assert arrow_arr.equals(expected)
 
 
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    dtype = StringDType()
+
+    arr = np.array(["some", "strings"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.string())
+    assert arrow_arr.type == pa.string()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.large_string())
+    assert arrow_arr.type == pa.large_string()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.string_view())
+    assert arrow_arr.type == pa.string_view()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype)
+    arr = arr_full[::2]  # every other element of arr_full
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["a", "c", "e"]
+
+
+@pytest.mark.numpy
+def test_numpy_stringdtype_thresholds_and_unicode():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    dtype = StringDType()
+
+    short = "hello"
+    medium = "a" * 100
+    long_ = "b" * 300
+    unicode_ = "árvíztűrő tükörfúrógép 🥐 你好"
+    long_unicode = "🥐" * 200
+
+    arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype)
+    assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode]
+
+
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype_nulls_and_mask():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    # StringDType configured with None as its NA sentinel
+    dtype = StringDType(na_object=None)
+    arr = np.array(["this array has", None, "as an entry"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]
+
+    # Test interplay of NA sentinel and an explicit mask:
+    # - index 1 is null because of na_object / Python None
+    # - index 2 is forced null by the mask
+    mask = np.array([False, False, True], dtype=bool)
+    arrow_arr = pa.array(arr, mask=mask)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.null_count == 2
+    assert arrow_arr.to_pylist() == ["this array has", None, None]
+
+    mask = np.array([True, False, True], dtype=bool)
+    assert pa.array(arr, mask=mask).to_pylist() == [None, None, None]
+
+
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype_string_sentinel_and_mask():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    sentinel = "__placeholder__"
+    dtype = StringDType(na_object=sentinel)
+    arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]
+
+    mask = np.array([False, False, True], dtype=bool)
+    assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None]
+
+
 @pytest.mark.numpy
 def test_array_string_from_non_string():
     # ARROW-5682 - when converting to string raise on non string-like dtype