Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

from cpython.pycapsule cimport PyCapsule_CheckExact, PyCapsule_GetPointer, PyCapsule_New
from pyarrow.includes.libarrow_python cimport HasNumPyStringDType, StringConversionMode

from collections.abc import Sequence
import os
Expand Down Expand Up @@ -65,6 +66,30 @@ def _ndarray_to_arrow_type(object values, DataType type):
return pyarrow_wrap_data_type(_ndarray_to_type(values, type))


cdef inline StringConversionMode _resolve_string_conversion_mode(object string_dtype):
    """Map the user-facing ``string_dtype`` argument to a StringConversionMode.

    ``True``/``"numpy"``/``"string"``/``"stringdtype"`` request NumPy's
    StringDType; ``False``/``None``/``"auto"``/``"python"``/``"object"`` keep
    the default Python object dtype. Raises ValueError for anything else.
    """
    if string_dtype is True:
        return StringConversionMode.STRING_DTYPE
    # False, None and "auto" all preserve the historical object-dtype output.
    if string_dtype is False or string_dtype is None:
        return StringConversionMode.PYTHON_OBJECT

    if isinstance(string_dtype, str):
        option = string_dtype.lower()
        if option in ("numpy", "string", "stringdtype"):
            return StringConversionMode.STRING_DTYPE
        if option in ("auto", "python", "object"):
            return StringConversionMode.PYTHON_OBJECT

    # List every accepted spelling so the error is actionable.
    raise ValueError(
        "string_dtype must be one of 'auto', 'numpy', 'string', 'stringdtype', "
        "'python', 'object', True, False or None"
    )


cdef shared_ptr[CDataType] _ndarray_to_type(object values,
DataType type) except *:
cdef shared_ptr[CDataType] c_type
Expand Down Expand Up @@ -1734,7 +1759,7 @@ cdef class Array(_PandasConvertible):
return values
return np.asarray(values, dtype=dtype)

def to_numpy(self, zero_copy_only=True, writable=False):
def to_numpy(self, zero_copy_only=True, writable=False, *, string_dtype="auto"):
"""
Return a NumPy view or copy of this array.

Expand All @@ -1757,6 +1782,14 @@ cdef class Array(_PandasConvertible):
By setting this to True, a copy of the array is made to ensure
it is writable.

string_dtype : {"auto", "numpy", "python", "object", True, False}, default "auto"
Controls how string-like arrays are converted when NumPy 2.0's
:class:`~numpy.typing.StringDType` is available. ``"numpy"`` or
``True`` will request StringDType (copying), ``"python"``/``"object"``
or ``False`` will force Python object dtype. ``"auto"`` preserves the
default object dtype unless StringDType is explicitly requested.
Converting to NumPy's StringDType always copies string data.

Returns
-------
array : numpy.ndarray
Expand All @@ -1775,6 +1808,11 @@ cdef class Array(_PandasConvertible):
raise ValueError(
"Cannot return a writable array if asking for zero-copy")

c_options.string_conversion_mode = _resolve_string_conversion_mode(string_dtype)
if c_options.string_conversion_mode == StringConversionMode.STRING_DTYPE:
if not HasNumPyStringDType():
raise NotImplementedError("NumPy StringDType not available")

# If there are nulls and the array is a DictionaryArray
# decoding the dictionary will make sure nulls are correctly handled.
# Decoding a dictionary does imply a copy by the way,
Expand Down
9 changes: 9 additions & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CTable] table,
PyObject** out)

c_bool HasNumPyStringDType()

void c_set_default_memory_pool \
" arrow::py::set_default_memory_pool"(CMemoryPool* pool)\

Expand All @@ -183,6 +185,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
PyOutputStream(object fo)

cdef cppclass PandasOptions:
cdef enum StringConversionMode:
AUTO
STRING_DTYPE
PYTHON_OBJECT
CMemoryPool* pool
c_bool strings_to_categorical
c_bool zero_copy_only
Expand All @@ -201,6 +207,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[const unordered_set[c_string]] categorical_columns
shared_ptr[const unordered_set[c_string]] extension_columns
c_bool to_numpy
StringConversionMode string_conversion_mode

ctypedef PandasOptions.StringConversionMode StringConversionMode


cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
Expand Down
241 changes: 239 additions & 2 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@
#include <vector>

#include "arrow/array.h"
#include "arrow/array/array_binary.h"
#include "arrow/buffer.h"
#include "arrow/datum.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_run_reader.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/hashing.h"
#include "arrow/util/int_util.h"
Expand Down Expand Up @@ -68,6 +70,16 @@ using internal::CheckIndexBounds;
using internal::OptionalParallelFor;

namespace py {

// Runtime probe for NumPy 2.x's StringDType support.
//
// Compile-time guard: built against NumPy < 2.0 headers, StringDType cannot
// exist, so report false. At runtime under NumPy >= 2.0 the probe checks the
// C-API table entry for the StringDType metaclass.
ARROW_PYTHON_EXPORT bool HasNumPyStringDType() {
#if NPY_ABI_VERSION >= 0x02000000
  // NOTE(review): reaches into the raw PyArray_API function table instead of
  // using public accessors. Offset 320 is assumed to be the DType metaclass
  // table and slot 39 the StringDType entry -- TODO confirm these indices
  // against the NumPy 2.x C-API layout; they are not a stable public contract.
  auto* dtype_table = reinterpret_cast<PyArray_DTypeMeta**>(PyArray_API + 320);
  return dtype_table[39] != nullptr;
#else
  return false;
#endif
}

namespace {

// Fix options for conversion of an inner (child) array.
Expand Down Expand Up @@ -344,6 +356,7 @@ class PandasWriter {
public:
enum type {
OBJECT,
STRING_DTYPE,
UINT8,
INT8,
UINT16,
Expand Down Expand Up @@ -1405,6 +1418,213 @@ class ObjectWriter : public TypedPandasWriter<NPY_OBJECT> {
}
};

#if NPY_ABI_VERSION >= 0x02000000
// Thin wrapper over NumPy's NpyString_acquire_allocator, invoked through the
// raw PyArray_API table so Arrow does not need the NumPy 2.x prototypes at
// compile time.
// NOTE(review): slot index 316 is assumed from the NumPy 2.x C-API table --
// TODO confirm; these indices are not a stable public contract.
inline npy_string_allocator* ArrowNpyString_acquire_allocator(
    const PyArray_StringDTypeObject* descr) {
  using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
  return reinterpret_cast<Func>(PyArray_API[316])(descr);
}

// Counterpart of ArrowNpyString_acquire_allocator; releases the StringDType
// allocator via the raw PyArray_API table.
// NOTE(review): slot index 318 assumed -- TODO confirm against NumPy 2.x.
inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
  using Func = void (*)(npy_string_allocator*);
  reinterpret_cast<Func>(PyArray_API[318])(allocator);
}

// Wrapper over NumPy's NpyString_pack: copies `length` bytes from `data` into
// the packed string slot. Returns -1 on failure (matching the NumPy API).
// NOTE(review): slot index 314 assumed -- TODO confirm against NumPy 2.x.
inline int ArrowNpyString_pack(npy_string_allocator* allocator,
                               npy_packed_static_string* packed, const char* data,
                               size_t length) {
  using Func =
      int (*)(npy_string_allocator*, npy_packed_static_string*, const char*, size_t);
  return reinterpret_cast<Func>(PyArray_API[314])(allocator, packed, data, length);
}

// Wrapper over NumPy's NpyString_pack_null: marks the packed slot as the
// missing-value sentinel. Returns -1 on failure (matching the NumPy API).
// NOTE(review): slot index 315 assumed -- TODO confirm against NumPy 2.x.
inline int ArrowNpyString_pack_null(npy_string_allocator* allocator,
                                    npy_packed_static_string* packed) {
  using Func = int (*)(npy_string_allocator*, npy_packed_static_string*);
  return reinterpret_cast<Func>(PyArray_API[315])(allocator, packed);
}

// Pack one UTF-8 string value into a NumPy StringDType slot.
//
// Takes the view by value: passing std::string_view by const reference adds
// an indirection for a two-word trivially-copyable type.
// Returns the pending Python error (e.g. MemoryError) if one is set,
// otherwise a generic Invalid status when packing fails.
Status PackStringValue(npy_string_allocator* allocator, npy_packed_static_string* packed,
                       std::string_view view) {
  const int result = ArrowNpyString_pack(allocator, packed, view.data(), view.size());
  if (result == -1) {
    // Prefer surfacing the Python exception when the NumPy API raised one.
    RETURN_IF_PYERROR();
    return Status::Invalid("Failed to pack NumPy StringDType value");
  }
  return Status::OK();
}

// Mark one NumPy StringDType slot as the null (missing-value) sentinel.
// Surfaces a pending Python exception if the NumPy API raised one, otherwise
// a generic Invalid status on failure.
Status PackNullString(npy_string_allocator* allocator, npy_packed_static_string* packed) {
  if (ArrowNpyString_pack_null(allocator, packed) != -1) {
    return Status::OK();
  }
  // Packing failed: a pending Python error (e.g. MemoryError) takes precedence.
  RETURN_IF_PYERROR();
  return Status::Invalid("Failed to pack NumPy StringDType value");
}

// Pack every element of an offset-based string array (StringArray or
// LargeStringArray) into a StringDType destination.
//
// `data` points at the first destination slot and `stride` is the byte
// distance between consecutive slots. Null entries are packed as the
// StringDType missing-value sentinel. Valid/null stretches are processed in
// runs via BitRunReader to avoid per-element bitmap tests.
template <typename ArrayType>
Status WriteOffsetStringValues(const ArrayType& arr, npy_string_allocator* allocator,
                               char* data, npy_intp stride) {
  using offset_type = typename ArrayType::offset_type;

  // raw_value_offsets() is already adjusted for the array's slice offset and
  // its entries index the value buffer from the buffer's start. The offsets
  // must therefore be used as-is: subtracting offsets[0] would read from the
  // wrong position whenever the first offset is non-zero (sliced arrays).
  const offset_type* offsets = arr.raw_value_offsets();
  const uint8_t* value_data = arr.value_data()->data();
  const uint8_t* validity = arr.null_bitmap_data();

  auto pack_values = [&](int64_t position, int64_t length) -> Status {
    for (int64_t i = 0; i < length; ++i) {
      const auto start = static_cast<int64_t>(offsets[position + i]);
      const auto end = static_cast<int64_t>(offsets[position + i + 1]);
      auto* packed =
          reinterpret_cast<npy_packed_static_string*>(data + (position + i) * stride);
      RETURN_NOT_OK(PackStringValue(
          allocator, packed,
          std::string_view(reinterpret_cast<const char*>(value_data + start),
                           static_cast<size_t>(end - start))));
    }
    return Status::OK();
  };

  auto pack_nulls = [&](int64_t position, int64_t length) -> Status {
    for (int64_t i = 0; i < length; ++i) {
      auto* packed =
          reinterpret_cast<npy_packed_static_string*>(data + (position + i) * stride);
      RETURN_NOT_OK(PackNullString(allocator, packed));
    }
    return Status::OK();
  };

  // Fast path: no validity bitmap consultation needed.
  if (arr.null_count() == 0) {
    return pack_values(/*position=*/0, arr.length());
  }

  arrow::internal::BitRunReader reader(validity, arr.offset(), arr.length());
  int64_t position = 0;
  auto run = reader.NextRun();
  while (run.length > 0) {
    if (run.set) {
      RETURN_NOT_OK(pack_values(position, run.length));
    } else {
      RETURN_NOT_OK(pack_nulls(position, run.length));
    }
    position += run.length;
    run = reader.NextRun();
  }

  return Status::OK();
}

// Pack every element of a view-based string array (StringViewArray) into a
// StringDType destination.
//
// `data` points at the first destination slot and `stride` is the byte
// distance between consecutive slots. Null entries become the StringDType
// missing-value sentinel; validity is consumed in runs via BitRunReader.
template <typename ArrayType>
Status WriteViewStringValues(const ArrayType& arr, npy_string_allocator* allocator,
                             char* data, npy_intp stride) {
  // Pack a single element, honoring its validity.
  auto pack_one = [&](int64_t index, bool is_valid) -> Status {
    auto* packed = reinterpret_cast<npy_packed_static_string*>(data + index * stride);
    if (!is_valid) {
      return PackNullString(allocator, packed);
    }
    return PackStringValue(allocator, packed, arr.GetView(index));
  };

  // Fast path: every element is valid, no bitmap to consult.
  if (arr.null_count() == 0) {
    for (int64_t index = 0; index < arr.length(); ++index) {
      RETURN_NOT_OK(pack_one(index, /*is_valid=*/true));
    }
    return Status::OK();
  }

  arrow::internal::BitRunReader run_reader(arr.null_bitmap_data(), arr.offset(),
                                           arr.length());
  int64_t base = 0;
  for (auto run = run_reader.NextRun(); run.length > 0; run = run_reader.NextRun()) {
    for (int64_t i = 0; i < run.length; ++i) {
      RETURN_NOT_OK(pack_one(base + i, run.set));
    }
    base += run.length;
  }

  return Status::OK();
}

// Writes Arrow string-like arrays (STRING, LARGE_STRING, STRING_VIEW) into a
// NumPy 2.x StringDType (NPY_VSTRING) ndarray. Packing through the NumPy
// string allocator always copies the data, so zero-copy requests are rejected.
class StringDTypeWriter : public PandasWriter {
 public:
  using PandasWriter::PandasWriter;

  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
    ARROW_UNUSED(py_ref);
    // StringDType output is always a copy; zero_copy_only cannot be honored.
    RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));
    RETURN_NOT_OK(EnsureAllocated());
    return CopyInto(std::move(data), /*rel_placement=*/0);
  }

  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
    RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));

    // The NumPy allocator/array calls below require the GIL.
    PyAcquireGIL lock;
    auto* np_arr = reinterpret_cast<PyArrayObject*>(block_arr_.obj());
    auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(PyArray_DESCR(np_arr));

    npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
    if (allocator == nullptr) {
      return Status::Invalid("Failed to acquire NumPy StringDType allocator");
    }
    // RAII guard: the allocator must be released on every exit path,
    // including the early returns from the packing helpers below.
    struct AllocatorGuard {
      npy_string_allocator* allocator;
      explicit AllocatorGuard(npy_string_allocator* alloc) : allocator(alloc) {}
      ~AllocatorGuard() { ArrowNpyString_release_allocator(allocator); }
    } guard(allocator);

    // NOTE(review): indexing STRIDES[1] assumes block_arr_ is 2-D (pandas
    // block layout with values contiguous along axis 1) -- presumably
    // guaranteed by the base writer's allocation; confirm for this path.
    const npy_intp row_stride = PyArray_STRIDES(np_arr)[1];
    char* data_start = PyArray_BYTES(np_arr) + rel_placement * PyArray_STRIDES(np_arr)[0];
    int64_t offset = 0;

    // Chunks are written back-to-back; all chunks of a ChunkedArray share
    // its type, so switching on data->type() per chunk is safe.
    for (const auto& chunk : data->chunks()) {
      char* chunk_data = data_start + offset * row_stride;
      switch (data->type()->id()) {
        case Type::STRING: {
          const auto& arr = checked_cast<const StringArray&>(*chunk);
          RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        case Type::LARGE_STRING: {
          const auto& arr = checked_cast<const LargeStringArray&>(*chunk);
          RETURN_NOT_OK(WriteOffsetStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        case Type::STRING_VIEW: {
          const auto& arr = checked_cast<const StringViewArray&>(*chunk);
          RETURN_NOT_OK(WriteViewStringValues(arr, allocator, chunk_data, row_stride));
          break;
        }
        default:
          return Status::TypeError("Expected an Arrow string array, got ",
                                   data->type()->ToString());
      }
      offset += chunk->length();
    }

    return Status::OK();
  }

 protected:
  // Allocate the output ndarray with NumPy's variable-width string dtype.
  Status Allocate() override { return AllocateNDArray(NPY_VSTRING); }
};
#endif

static inline bool IsNonNullContiguous(const ChunkedArray& data) {
return data.num_chunks() == 1 && data.null_count() == 0;
}
Expand Down Expand Up @@ -2056,6 +2276,11 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
case PandasWriter::EXTENSION:
*writer = std::make_shared<ExtensionWriter>(options, num_rows, num_columns);
break;
#if NPY_ABI_VERSION >= 0x02000000
case PandasWriter::STRING_DTYPE:
*writer = std::make_shared<StringDTypeWriter>(options, num_rows, num_columns);
break;
#endif
BLOCK_CASE(OBJECT, ObjectWriter);
BLOCK_CASE(UINT8, UInt8Writer);
BLOCK_CASE(INT8, Int8Writer);
Expand Down Expand Up @@ -2132,8 +2357,20 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
break;
case Type::STRING: // fall through
case Type::LARGE_STRING: // fall through
case Type::STRING_VIEW: // fall through
case Type::BINARY: // fall through
case Type::STRING_VIEW: { // fall through
#if NPY_ABI_VERSION >= 0x02000000
if (options.to_numpy && options.string_conversion_mode ==
PandasOptions::StringConversionMode::STRING_DTYPE) {
// NumPy's StringDType allocator always copies string data, so zero-copy
// requests must continue to route through the object-dtype path.
*output_type = PandasWriter::STRING_DTYPE;
break;
}
#endif
*output_type = PandasWriter::OBJECT;
break;
}
case Type::BINARY: // fall through
case Type::LARGE_BINARY:
case Type::BINARY_VIEW:
case Type::NA: // fall through
Expand Down
Loading
Loading