Skip to content

Consider supporting all the different lancedb table column formats. #6

@ayao227

Description

@ayao227

Consider serializing all the different column formats using this utility, and moving the list/vector handling in serialize_arrow_value to a separate method?

This code specifically encodes binary data using Base64. The function below is called convert_bytes, but it could be renamed to
serialize_object or serialize_value.

import base64
from datetime import date, datetime, time, timedelta

import numpy as np
import pyarrow as pa


def _convert_temporal(obj):
    """Convert temporal types to string representation."""
    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    if isinstance(obj, timedelta):
        return obj.total_seconds()
    return str(obj)


def _convert_pyarrow_scalar(obj):
    """Convert a PyArrow scalar to a JSON-serializable Python value.

    Args:
        obj: A ``pa.Scalar`` instance.

    Returns:
        ``None`` for a null scalar; a Base64 string for binary types
        (variable, large and fixed-size); the ``_convert_temporal`` result
        for temporal types; a recursively converted list for list-like and
        map types; a recursively converted dict for struct types; a
        ``float`` for floating-point types; otherwise the plain
        ``as_py()`` value.
    """
    value = obj.as_py()
    # Null scalars yield None from as_py(); return it as-is so the
    # type-specific branches below never see a missing value.
    if value is None:
        return None

    arrow_type = obj.type
    types = pa.types

    if (
        types.is_binary(arrow_type)
        or types.is_large_binary(arrow_type)
        or types.is_fixed_size_binary(arrow_type)
    ):
        return base64.b64encode(value).decode("utf-8")

    if types.is_temporal(arrow_type):
        return _convert_temporal(value)

    # Fixed-size lists are included so nested values inside vector-style
    # columns go through the same recursive conversion as ordinary lists.
    if (
        types.is_list(arrow_type)
        or types.is_large_list(arrow_type)
        or types.is_fixed_size_list(arrow_type)
        or types.is_map(arrow_type)
    ):
        return [convert_bytes(item) for item in value]

    if types.is_struct(arrow_type):
        # as_py() already materialises the struct as a dict, so recurse on
        # its items instead of using an array-level field accessor.
        return {name: convert_bytes(item) for name, item in value.items()}

    if types.is_floating(arrow_type):
        return float(value)

    return value


def _convert_container(obj):
    """Convert container types (dict, list, tuple) recursively."""
    if isinstance(obj, dict):
        return {key: convert_bytes(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_bytes(item) for item in obj]
    return obj


def _convert_basic_types(obj):
    """Convert a single non-container value to a JSON-serializable form.

    Args:
        obj: Any object.

    Returns:
        - PyArrow binary scalar: Base64 string, or ``None`` when null.
        - ``bytes`` / ``bytearray``: Base64 string.
        - ``datetime`` / ``date`` / ``time``: ISO-8601 string.
        - ``timedelta``: duration in seconds.
        - NumPy number or boolean scalar: the equivalent Python scalar.
        - Anything else: ``obj`` itself (the same object), which callers
          use to detect that no conversion took place.
    """
    if isinstance(obj, pa.BinaryScalar):
        # base64.b64encode needs a bytes-like object, so unwrap the scalar
        # first; as_py() gives None for a null scalar.
        payload = obj.as_py()
        if payload is None:
            return None
        return base64.b64encode(payload).decode("utf-8")
    if isinstance(obj, (bytes, bytearray)):
        return base64.b64encode(obj).decode("utf-8")
    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    if isinstance(obj, timedelta):
        return obj.total_seconds()
    # np.bool_ is not an np.number, yet it is just as unserializable.
    if isinstance(obj, (np.number, np.bool_)):
        return obj.item()
    return obj


def convert_bytes(obj):
    """
    Turn ``obj`` into something JSON can serialize, recursing as needed.

    Conversion is attempted in this order, stopping at the first stage
    that produces a different object:

    1. basic values (bytes / PyArrow binary -> Base64 string, date and
       time types -> ISO string, timedelta -> seconds, NumPy numbers ->
       Python numbers)
    2. containers (dict, list, tuple), converted element by element
    3. remaining PyArrow scalars -> Python native types

    Objects matched by none of the stages are returned untouched.
    """
    # A stage signals "not mine" by handing back the identical object.
    for stage in (_convert_basic_types, _convert_container):
        converted = stage(obj)
        if converted is not obj:
            return converted

    if not isinstance(obj, pa.Scalar):
        return obj
    return _convert_pyarrow_scalar(obj)

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions