Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode

from collections.abc import Sequence
import os
Expand Down Expand Up @@ -65,6 +66,30 @@ def _ndarray_to_arrow_type(object values, DataType type):
return pyarrow_wrap_data_type(_ndarray_to_type(values, type))


cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype):
    """Map the user-facing ``string_dtype`` argument to a StringConversionMode.

    ``True``/``"numpy"``/``"string"``/``"stringdtype"`` request NumPy's
    StringDType; ``False``/``None``/``"auto"``/``"python"``/``"object"`` keep
    the default Python object dtype. Raises ValueError for anything else.
    """
    if string_dtype is True:
        return StringConversionMode.STRING_DTYPE
    # False, None and "auto" all preserve the historical object-dtype output.
    if string_dtype is False or string_dtype is None:
        return StringConversionMode.PYTHON_OBJECT

    if isinstance(string_dtype, str):
        option = string_dtype.lower()
        if option in ("numpy", "string", "stringdtype"):
            return StringConversionMode.STRING_DTYPE
        if option in ("auto", "python", "object"):
            return StringConversionMode.PYTHON_OBJECT

    # List every accepted spelling so the error is actionable.
    raise ValueError(
        "string_dtype must be one of 'auto', 'numpy', 'string', 'stringdtype', "
        "'python', 'object', True, False or None"
    )


cdef shared_ptr[CDataType] _ndarray_to_type(object values,
DataType type) except *:
cdef shared_ptr[CDataType] c_type
Expand Down Expand Up @@ -1734,7 +1759,7 @@ cdef class Array(_PandasConvertible):
return values
return np.asarray(values, dtype=dtype)

def to_numpy(self, zero_copy_only=True, writable=False):
def to_numpy(self, zero_copy_only=True, writable=False, *, string_dtype="auto"):
"""
Return a NumPy view or copy of this array.

Expand All @@ -1757,6 +1782,14 @@ cdef class Array(_PandasConvertible):
By setting this to True, a copy of the array is made to ensure
it is writable.

string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto"
Controls how string-like arrays are converted when NumPy 2.0's
:class:`~numpy.typing.StringDType` is available. ``"numpy"`` or
``True`` will request StringDType (copying), ``"python"``/``"object"``
or ``False`` will force Python object dtype. ``"auto"`` preserves the
default object dtype unless StringDType is explicitly requested.
Converting to NumPy's StringDType always copies string data.

Returns
-------
array : numpy.ndarray
Expand All @@ -1775,6 +1808,11 @@ cdef class Array(_PandasConvertible):
raise ValueError(
"Cannot return a writable array if asking for zero-copy")

c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype)
if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE:
if not HasNumPyStringDType():
raise NotImplementedError("NumPy StringDType not available")

# If there are nulls and the array is a DictionaryArray
# decoding the dictionary will make sure nulls are correctly handled.
# Decoding a dictionary does imply a copy by the way,
Expand Down
9 changes: 9 additions & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CTable] table,
PyObject** out)

c_bool HasNumPyStringDType()

void c_set_default_memory_pool \
" arrow::py::set_default_memory_pool"(CMemoryPool* pool)\

Expand All @@ -183,6 +185,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
PyOutputStream(object fo)

cdef cppclass PandasOptions:
cdef enum StringConversionMode:
AUTO
STRING_DTYPE
PYTHON_OBJECT
CMemoryPool* pool
c_bool strings_to_categorical
c_bool zero_copy_only
Expand All @@ -201,6 +207,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[const unordered_set[c_string]] categorical_columns
shared_ptr[const unordered_set[c_string]] extension_columns
c_bool to_numpy
StringConversionMode string_conversion_mode

ctypedef PandasOptions.StringConversionMode StringConversionMode


cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
Expand Down
241 changes: 239 additions & 2 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@
#include <vector>

#include "arrow/array.h"
#include "arrow/array/array_binary.h"
#include "arrow/buffer.h"
#include "arrow/datum.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/hashing.h"
#include "arrow/util/int_util.h"
Expand Down Expand Up @@ -68,6 +70,16 @@ using internal::CheckIndexBounds;
using internal::OptionalParallelFor;

namespace py {

// Runtime probe for NumPy 2.x's StringDType support.
//
// Compile-time guard: built against NumPy < 2.0 headers, StringDType cannot
// exist, so report false. At runtime under NumPy >= 2.0 the probe checks the
// C-API table entry for the StringDType metaclass.
ARROW_PYTHON_EXPORT bool HasNumPyStringDType() {
#if NPY_ABI_VERSION >= 0x02000000
  // NOTE(review): reaches into the raw PyArray_API function table instead of
  // using public accessors. Offset 320 is assumed to be the DType metaclass
  // table and slot 39 the StringDType entry -- TODO confirm these indices
  // against the NumPy 2.x C-API layout; they are not a stable public contract.
  auto* dtype_table = reinterpret_cast<PyArray_DTypeMeta**>(PyArray_API + 320);
  return dtype_table[39] != nullptr;
#else
  return false;
#endif
}

namespace {

// Fix options for conversion of an inner (child) array.
Expand Down Expand Up @@ -344,6 +356,7 @@ class PandasWriter {
public:
enum type {
OBJECT,
STRING_DTYPE,
UINT8,
INT8,
UINT16,
Expand Down Expand Up @@ -1405,6 +1418,213 @@ class ObjectWriter : public TypedPandasWriter<NPY_OBJECT> {
}
};

#if NPY_ABI_VERSION >= 0x02000000
// Thin wrapper over NumPy's NpyString_acquire_allocator, invoked through the
// raw PyArray_API table so Arrow does not need the NumPy 2.x prototypes at
// compile time.
// NOTE(review): slot index 316 is assumed from the NumPy 2.x C-API table --
// TODO confirm; these indices are not a stable public contract.
inline npy_string_allocator* ArrowNpyString_acquire_allocator(
    const PyArray_StringDTypeObject* descr) {
  using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
  return reinterpret_cast<Func>(PyArray_API[316])(descr);
}

// Counterpart of ArrowNpyString_acquire_allocator; releases the StringDType
// allocator via the raw PyArray_API table.
// NOTE(review): slot index 318 assumed -- TODO confirm against NumPy 2.x.
inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
  using Func = void (*)(npy_string_allocator*);
  reinterpret_cast<Func>(PyArray_API[318])(allocator);
}

// Wrapper over NumPy's NpyString_pack: copies `length` bytes from `data` into
// the packed string slot. Returns -1 on failure (matching the NumPy API).
// NOTE(review): slot index 314 assumed -- TODO confirm against NumPy 2.x.
inline int ArrowNpyString_pack(npy_string_allocator* allocator,
                               npy_packed_static_string* packed, const char* data,
                               size_t length) {
  using Func =
      int (*)(npy_string_allocator*, npy_packed_static_string*, const char*, size_t);
  return reinterpret_cast<Func>(PyArray_API[314])(allocator, packed, data, length);
}

// Wrapper over NumPy's NpyString_pack_null: marks the packed slot as the
// missing-value sentinel. Returns -1 on failure (matching the NumPy API).
// NOTE(review): slot index 315 assumed -- TODO confirm against NumPy 2.x.
inline int ArrowNpyString_pack_null(npy_string_allocator* allocator,
                                    npy_packed_static_string* packed) {
  using Func = int (*)(npy_string_allocator*, npy_packed_static_string*);
  return reinterpret_cast<Func>(PyArray_API[315])(allocator, packed);
}

// Pack one UTF-8 string value into a NumPy StringDType slot.
//
// Takes the view by value: passing std::string_view by const reference adds
// an indirection for a two-word trivially-copyable type.
// Returns the pending Python error (e.g. MemoryError) if one is set,
// otherwise a generic Invalid status when packing fails.
Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string* packed,
                       std::string_view view) {
  const int result = ArrowNpyString_pack(allocator, packed, view.data(), view.size());
  if (result == -1) {
    // Prefer surfacing the Python exception when the NumPy API raised one.
    RETURN_IF_PYERROR();
    return Status::Invalid("Failed to pack NumPy StringDType value");
  }
  return Status::OK();
}

// Mark one NumPy StringDType slot as the null (missing-value) sentinel.
// Surfaces a pending Python exception if the NumPy API raised one, otherwise
// a generic Invalid status on failure.
Status PackNullString(npy_string_allocator* allocator, npy_packed_static_string* packed) {
  if (ArrowNpyString_pack_null(allocator, packed) != -1) {
    return Status::OK();
  }
  // Packing failed: a pending Python error (e.g. MemoryError) takes precedence.
  RETURN_IF_PYERROR();
  return Status::Invalid("Failed to pack NumPy StringDType value");
}

// Pack every element of an offset-based string array (StringArray or
// LargeStringArray) into a StringDType destination.
//
// `data` points at the first destination slot and `stride` is the byte
// distance between consecutive slots. Null entries are packed as the
// StringDType missing-value sentinel. Valid/null stretches are processed in
// runs via BitRunReader to avoid per-element bitmap tests.
template <typename ArrayType>
Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* allocator,
                               char* data, npy_intp stride) {
  using offset_type = typename ArrayType::offset_type;

  // raw_value_offsets() is already adjusted for the array's slice offset and
  // its entries index the value buffer from the buffer's start. The offsets
  // must therefore be used as-is: subtracting offsets[0] would read from the
  // wrong position whenever the first offset is non-zero (sliced arrays).
  const offset_type* offsets = arr.raw_value_offsets();
  const uint8_t* value_data = arr.value_data()->data();
  const uint8_t* validity = arr.null_bitmap_data();

  auto pack_values = [&](int64_t position, int64_t length) -> Status {
    for (int64_t i = 0; i < length; ++i) {
      const auto start = static_cast<int64_t>(offsets[position + i]);
      const auto end = static_cast<int64_t>(offsets[position + i + 1]);
      auto* packed =
          reinterpret_cast<npy_packed_static_string*>(data + (position + i) * stride);
      RETURN_NOT_OK(PackStringValue(
          allocator, packed,
          std::string_view(reinterpret_cast<const char*>(value_data + start),
                           static_cast<size_t>(end - start))));
    }
    return Status::OK();
  };

  auto pack_nulls = [&](int64_t position, int64_t length) -> Status {
    for (int64_t i = 0; i < length; ++i) {
      auto* packed =
          reinterpret_cast<npy_packed_static_string*>(data + (position + i) * stride);
      RETURN_NOT_OK(PackNullString(allocator, packed));
    }
    return Status::OK();
  };

  // Fast path: no validity bitmap consultation needed.
  if (arr.null_count() == 0) {
    return pack_values(/*position=*/0, arr.length());
  }

  arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length());
  int64_t position = 0;
  auto run = reader.NextRun();
  while (run.length > 0) {
    if (run.set) {
      RETURN_NOT_OK(pack_values(position, run.length));
    } else {
      RETURN_NOT_OK(pack_nulls(position, run.length));
    }
    position += run.length;
    run = reader.NextRun();
  }

  return Status::OK();
}

// Pack every element of a view-based string array (StringViewArray) into a
// StringDType destination.
//
// `data` points at the first destination slot and `stride` is the byte
// distance between consecutive slots. Null entries become the StringDType
// missing-value sentinel; validity is consumed in runs via BitRunReader.
template <typename ArrayType>
Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocator,
                             char* data, npy_intp stride) {
  // Pack a single element, honoring its validity.
  auto pack_one = [&](int64_t index, bool is_valid) -> Status {
    auto* packed = reinterpret_cast<npy_packed_static_string*>(data + index * stride);
    if (!is_valid) {
      return PackNullString(allocator, packed);
    }
    return PackStringValue(allocator, packed, arr.GetView(index));
  };

  // Fast path: every element is valid, no bitmap to consult.
  if (arr.null_count() == 0) {
    for (int64_t index = 0; index < arr.length(); ++index) {
      RETURN_NOT_OK(pack_one(index, /*is_valid=*/true));
    }
    return Status::OK();
  }

  arrow::internal::BitRunReader run_reader(arr.null_bitmap_data(), arr.offset(),
                                           arr.length());
  int64_t base = 0;
  for (auto run = run_reader.NextRun(); run.length > 0; run = run_reader.NextRun()) {
    for (int64_t i = 0; i < run.length; ++i) {
      RETURN_NOT_OK(pack_one(base + i, run.set));
    }
    base += run.length;
  }

  return Status::OK();
}

// Writes Arrow string-like arrays (STRING, LARGE_STRING, STRING_VIEW) into a
// NumPy 2.x StringDType (NPY_VSTRING) ndarray. Packing through the NumPy
// string allocator always copies the data, so zero-copy requests are rejected.
class StringDTypeWriter : public PandasWriter {
 public:
  using PandasWriter::PandasWriter;

  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
    ARROW_UNUSED(py_ref);
    // StringDType output is always a copy; zero_copy_only cannot be honored.
    RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));
    RETURN_NOT_OK(EnsureAllocated());
    return CopyInto(std::move(data), /*rel_placement=*/0);
  }

  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
    RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));

    // The NumPy allocator/array calls below require the GIL.
    PyAcquireGIL lock;
    auto* np_arr = reinterpret_cast<PyArrayObject*>(block_arr_.obj());
    auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(PyArray_DESCR(np_arr));

    npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
    if (allocator == nullptr) {
      return Status::Invalid("Failed to acquire NumPy StringDType allocator");
    }
    // RAII guard: the allocator must be released on every exit path,
    // including the early returns from the packing helpers below.
    struct AllocatorGuard {
      npy_string_allocator* allocator;
      explicit AllocatorGuard(npy_string_allocator* alloc) : allocator(alloc) {}
      ~AllocatorGuard() { ArrowNpyString_release_allocator(allocator); }
    } guard(allocator);

    // NOTE(review): indexing STRIDES[1] assumes block_arr_ is 2-D (pandas
    // block layout with values contiguous along axis 1) -- presumably
    // guaranteed by the base writer's allocation; confirm for this path.
    const npy_intp row_stride = PyArray_STRIDES(np_arr)[1];
    char* data_start = PyArray_BYTES(np_arr) + rel_placement * PyArray_STRIDES(np_arr)[0];
    int64_t offset = 0;

    // Chunks are written back-to-back; all chunks of a ChunkedArray share
    // its type, so switching on data->type() per chunk is safe.
    for (const auto& chunk : data->chunks()) {
      char* chunk_data = data_start + offset * row_stride;
      switch (data->type()->id()) {
        case Type::STRING: {
          const auto& arr = checked_cast<const StringArray&>(*chunk);
          RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        case Type::LARGE_STRING: {
          const auto& arr = checked_cast<const LargeStringArray&>(*chunk);
          RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        case Type::STRING_VIEW: {
          const auto& arr = checked_cast<const StringViewArray&>(*chunk);
          RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        default:
          return Status::TypeError("Expected an Arrow string array, got ",
                                   data->type()->ToString());
      }
      offset += chunk->length();
    }

    return Status::OK();
  }

 protected:
  // Allocate the output ndarray with NumPy's variable-width string dtype.
  Status Allocate() override { return AllocateNDArray(NPY_VSTRING); }
};
#endif

static inline bool IsNonNullContiguous(const ChunkedArray& data) {
return data.num_chunks() == 1 && data.null_count() == 0;
}
Expand Down Expand Up @@ -2056,6 +2276,11 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
case PandasWriter::EXTENSION:
*writer = std::make_shared<ExtensionWriter>(options, num_rows, num_columns);
break;
#if NPY_ABI_VERSION >= 0x02000000
case PandasWriter::STRING_DTYPE:
*writer = std::make_shared<StringDTypeWriter>(options, num_rows, num_columns);
break;
#endif
BLOCK_CASE(OBJECT, ObjectWriter);
BLOCK_CASE(UINT8, UInt8Writer);
BLOCK_CASE(INT8, Int8Writer);
Expand Down Expand Up @@ -2132,8 +2357,20 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
break;
case Type::STRING: // fall through
case Type::LARGE_STRING: // fall through
case Type::STRING_VIEW: // fall through
case Type::BINARY: // fall through
case Type::STRING_VIEW: { // fall through
#if NPY_ABI_VERSION >= 0x02000000
if (options.to_numpy && options.string_conversion_mode ==
PandasOptions::StringConversionMode::STRING_DTYPE) {
// NumPy's StringDType allocator always copies string data, so zero-copy
// requests must continue to route through the object-dtype path.
*output_type = PandasWriter::STRING_DTYPE;
break;
}
#endif
*output_type = PandasWriter::OBJECT;
break;
}
case Type::BINARY: // fall through
case Type::LARGE_BINARY:
case Type::BINARY_VIEW:
case Type::NA: // fall through
Expand Down
Loading
Loading