-
-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
Consider serializing all the different column formats using this utility, and moving the list/vector handling in serialize_arrow_value to a separate method?
This code specifically encodes binary data using base64. The entry point below is called convert_bytes, but it could be renamed to
serialize_object or serialize_value.
import base64
from datetime import date, datetime, time, timedelta
import numpy as np
import pyarrow as pa
def _convert_temporal(obj):
"""Convert temporal types to string representation."""
if isinstance(obj, (datetime, date, time)):
return obj.isoformat()
if isinstance(obj, timedelta):
return obj.total_seconds()
return str(obj)
def _convert_pyarrow_scalar(obj):
    """Convert a PyArrow scalar to a JSON-serializable Python value.

    Args:
        obj: A ``pa.Scalar`` instance.

    Returns:
        ``None`` for a null scalar; a Base64 string for binary, large-binary
        and fixed-size-binary scalars; the result of ``_convert_temporal``
        for temporal scalars; a list for list-like and map scalars; a dict
        keyed by field name for struct scalars; a ``float`` for floating
        scalars; otherwise whatever ``obj.as_py()`` returns.
    """
    # A null scalar of any type maps to JSON null. Without this guard the
    # branches below would hit b64encode(None) / float(None), iterate None,
    # or turn None into the literal text "None".
    if not obj.is_valid:
        return None

    arrow_type = obj.type
    value = obj.as_py()

    if (
        pa.types.is_binary(arrow_type)
        or pa.types.is_large_binary(arrow_type)
        or pa.types.is_fixed_size_binary(arrow_type)
    ):
        return base64.b64encode(value).decode("utf-8")
    if pa.types.is_temporal(arrow_type):
        return _convert_temporal(value)
    if (
        pa.types.is_list(arrow_type)
        or pa.types.is_large_list(arrow_type)
        or pa.types.is_fixed_size_list(arrow_type)
        or pa.types.is_map(arrow_type)
    ):
        # Elements are already plain Python objects here; recurse so nested
        # bytes / temporal values get converted as well.
        return [convert_bytes(item) for item in value]
    if pa.types.is_struct(arrow_type):
        # as_py() on a struct scalar yields a dict keyed by field name, which
        # avoids depending on a per-field accessor on the scalar itself.
        return {name: convert_bytes(member) for name, member in value.items()}
    if pa.types.is_floating(arrow_type):
        return float(value)
    return value
def _convert_container(obj):
"""Convert container types (dict, list, tuple) recursively."""
if isinstance(obj, dict):
return {key: convert_bytes(value) for key, value in obj.items()}
if isinstance(obj, (list, tuple)):
return [convert_bytes(item) for item in obj]
return obj
def _convert_basic_types(obj):
"""Convert basic Python types to JSON-serializable format."""
if isinstance(obj, (bytes, pa.BinaryScalar)):
return base64.b64encode(obj).decode("utf-8")
if isinstance(obj, (datetime, date, time)):
return obj.isoformat()
if isinstance(obj, timedelta):
return obj.total_seconds()
if isinstance(obj, np.number):
return obj.item()
return obj
def convert_bytes(obj):
    """
    Turn *obj* into a JSON-serializable value, descending into nested data.

    Conversions applied:
    - bytes / PyArrow binary -> Base64-encoded string
    - datetime types -> ISO format string
    - PyArrow scalars -> native Python values
    - dicts, lists and tuples -> converted member by member

    Objects that none of the helpers recognise are returned unchanged.
    """
    # Each helper hands its argument back untouched when it has nothing to
    # do, so an identity check tells us whether a conversion happened.
    # Basic types are tried before containers.
    for converter in (_convert_basic_types, _convert_container):
        converted = converter(obj)
        if converted is not obj:
            return converted

    # Last resort: PyArrow scalars not already handled above.
    return _convert_pyarrow_scalar(obj) if isinstance(obj, pa.Scalar) else obj
Metadata
Metadata
Assignees
Labels
No labels