Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
repos:
- repo: https://github.com/tsvikas/sync-with-uv
rev: v0.4.0 # replace with the latest version
hooks:
- id: sync-with-uv
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.4
hooks:
- id: ruff-format
types_or: [ python, pyi ]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
types_or: [ python, pyi ]
- id: end-of-file-fixer
types_or: [ python, pyi ]
- id: check-yaml
- id: check-added-large-files
- id: check-merge-conflict
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ dependencies = [
"deltalake>=1.0.2",
"graphviz>=0.21",
"gitpython>=3.1.45",
"starfix>=0.1.3",
"pygraphviz>=1.14",
"uuid-utils>=0.11.1",
]
readme = "README.md"
requires-python = ">=3.11.0"
Expand Down Expand Up @@ -54,15 +57,16 @@ dev = [
"ipywidgets>=8.1.7",
"jsonschema>=4.25.0",
"minio>=7.2.16",
"pre-commit>=4.4.0",
"pre-commit-hooks>=6.0.0",
"pyarrow-stubs>=20.0.0.20250716",
"pygraphviz>=1.14",
"pyiceberg>=0.9.1",
"pyright>=1.1.404",
"pytest>=8.3.5",
"pytest-cov>=6.1.1",
"ray[default]==2.48.0",
"redis>=6.2.0",
"ruff>=0.11.11",
"ruff>=0.14.4",
"sphinx>=8.2.3",
"tqdm>=4.67.1",
]
52 changes: 26 additions & 26 deletions src/orcapod/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
from .config import DEFAULT_CONFIG, Config
from .core import DEFAULT_TRACKER_MANAGER
from .core.pods import function_pod, FunctionPod, CachedPod
from .core import streams
from .core import operators
from .core import sources
from .core.sources import DataFrameSource
from . import databases
from .pipeline import Pipeline
# from .config import DEFAULT_CONFIG, Config
# from .core import DEFAULT_TRACKER_MANAGER
# from .core.packet_function import PythonPacketFunction
# from .core.function_pod import FunctionPod
# from .core import streams
# from .core import operators
# from .core import sources
# from .core.sources import DataFrameSource
# from . import databases
# from .pipeline import Pipeline


# no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking

no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking

__all__ = [
"DEFAULT_CONFIG",
"Config",
"DEFAULT_TRACKER_MANAGER",
"no_tracking",
"function_pod",
"FunctionPod",
"CachedPod",
"streams",
"databases",
"sources",
"DataFrameSource",
"operators",
"Pipeline",
]
# __all__ = [
# "DEFAULT_CONFIG",
# "Config",
# "DEFAULT_TRACKER_MANAGER",
# "no_tracking",
# "function_pod",
# "FunctionPod",
# "CachedPod",
# "streams",
# "databases",
# "sources",
# "DataFrameSource",
# "operators",
# "Pipeline",
# ]
9 changes: 6 additions & 3 deletions src/orcapod/contexts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@
versions = get_available_contexts()
"""

from .core import DataContext, ContextValidationError, ContextResolutionError
from .registry import JSONDataContextRegistry
from typing import Any
from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp

from orcapod.protocols import hashing_protocols as hp
from orcapod.protocols import semantic_types_protocols as sp

from .core import ContextResolutionError, ContextValidationError, DataContext
from .registry import JSONDataContextRegistry

# Global registry instance (lazily initialized)
_registry: JSONDataContextRegistry | None = None
Expand Down
9 changes: 5 additions & 4 deletions src/orcapod/contexts/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from dataclasses import dataclass

from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp
from orcapod.protocols.hashing_protocols import ArrowHasher, ObjectHasher
from orcapod.protocols.semantic_types_protocols import TypeConverter


@dataclass
Expand All @@ -31,9 +32,9 @@ class DataContext:
context_key: str
version: str
description: str
type_converter: sp.TypeConverter
arrow_hasher: hp.ArrowHasher
object_hasher: hp.ObjectHasher
type_converter: TypeConverter
arrow_hasher: ArrowHasher
object_hasher: ObjectHasher # this is the currently the JSON hasher


class ContextValidationError(Exception):
Expand Down
8 changes: 4 additions & 4 deletions src/orcapod/contexts/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
"""

import json


import logging
from pathlib import Path
from typing import Any
import logging

from orcapod.utils.object_spec import parse_objectspec
from .core import DataContext, ContextValidationError, ContextResolutionError

from .core import ContextResolutionError, ContextValidationError, DataContext

logger = logging.getLogger(__name__)

Expand Down
4 changes: 1 addition & 3 deletions src/orcapod/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from .trackers import DEFAULT_TRACKER_MANAGER
from .system_constants import constants
from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER

__all__ = [
"DEFAULT_TRACKER_MANAGER",
"constants",
]
143 changes: 122 additions & 21 deletions src/orcapod/core/base.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,49 @@
from __future__ import annotations

import logging
from abc import ABC
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from typing import Any

from orcapod import DEFAULT_CONFIG, contexts
from orcapod.config import Config
import orcapod.contexts as contexts
from orcapod.config import DEFAULT_CONFIG, Config
from orcapod.protocols import hashing_protocols as hp

logger = logging.getLogger(__name__)


class LablableBase:
# Base classes for Orcapod core components, providing common functionality.


class LabelableMixin:
"""
Mixin class for objects that can have a label. Provides a mechanism to compute a label based on the object's content.
By default, explicitly set label will always take precedence over computed label and inferred label.
"""

def __init__(self, label: str | None = None, **kwargs):
self._label = label
super().__init__(**kwargs)

@property
def has_assigned_label(self) -> bool:
def label(self) -> str:
"""
Check if the label is explicitly set for this object.
Get the label of this object.

Returns:
bool: True if the label is explicitly set, False otherwise.
str | None: The label of the object, or None if not set.
"""
return self._label is not None
return self._label or self.computed_label() or self.__class__.__name__

@property
def label(self) -> str:
def has_assigned_label(self) -> bool:
"""
Get the label of this object.
Check if the label has been explicitly set for this object.

Returns:
str | None: The label of the object, or None if not set.
bool: True if the label is explicitly set, False otherwise.
"""
return self._label or self.computed_label() or self.__class__.__name__
return self._label is not None

@label.setter
def label(self, label: str | None) -> None:
Expand All @@ -52,18 +63,25 @@ def computed_label(self) -> str | None:
return None


class ContextAwareConfigurableBase(ABC):
class DataContextMixin:
"""
Mixin to associate data context and an Orcapod config with an object. Deriving class allows data context and Orcapod config to be
explicitly specified and if not provided, use the default data context and Orcapod config.
"""

def __init__(
self,
data_context: str | contexts.DataContext | None = None,
orcapod_config: Config | None = None,
**kwargs,
):
super().__init__(**kwargs)
self._data_context = contexts.resolve_context(data_context)
if orcapod_config is None:
orcapod_config = DEFAULT_CONFIG
orcapod_config = (
DEFAULT_CONFIG # DEFAULT_CONFIG as defined in orcapod/config.py
)
self._orcapod_config = orcapod_config
self._data_context = contexts.resolve_context(data_context)

@property
def orcapod_config(self) -> Config:
Expand All @@ -73,13 +91,18 @@ def orcapod_config(self) -> Config:
def data_context(self) -> contexts.DataContext:
return self._data_context

# TODO: re-evaluate whether changing data context should be allowed
@data_context.setter
def data_context(self, context: str | contexts.DataContext | None) -> None:
self._data_context = contexts.resolve_context(context)

@property
def data_context_key(self) -> str:
"""Return the data context key."""
return self._data_context.context_key


class ContentIdentifiableBase(ContextAwareConfigurableBase):
class ContentIdentifiableBase(DataContextMixin, ABC):
"""
Base class for content-identifiable objects.
This class provides a way to define objects that can be uniquely identified
Expand All @@ -90,17 +113,22 @@ class ContentIdentifiableBase(ContextAwareConfigurableBase):
Two content-identifiable objects are considered equal if their `identity_structure` returns the same value.
"""

def __init__(self, **kwargs) -> None:
def __init__(
self,
data_context: str | contexts.DataContext | None = None,
orcapod_config: Config | None = None,
) -> None:
"""
Initialize the ContentHashable with an optional ObjectHasher.

Args:
identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing.
"""
super().__init__(**kwargs)
super().__init__(data_context=data_context, orcapod_config=orcapod_config)
self._cached_content_hash: hp.ContentHash | None = None
self._cached_int_hash: int | None = None

@abstractmethod
def identity_structure(self) -> Any:
"""
Return a structure that represents the identity of this object.
Expand All @@ -112,7 +140,7 @@ def identity_structure(self) -> Any:
Returns:
Any: A structure representing this object's content, or None to use default hash
"""
raise NotImplementedError("Subclasses must implement identity_structure")
...

def content_hash(self) -> hp.ContentHash:
"""
Expand Down Expand Up @@ -157,5 +185,78 @@ def __eq__(self, other: object) -> bool:
return self.identity_structure() == other.identity_structure()


class LabeledContentIdentifiableBase(ContentIdentifiableBase, LablableBase):
pass
class TemporalMixin:
"""
Mixin class that adds temporal functionality to an Orcapod entity.
It provides methods to track and manage the last modified timestamp of the entity.
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
self._modified_time = self._update_modified_time()

@property
def last_modified(self) -> datetime | None:
"""
When this object's content was last modified.

Returns:
datetime: Content last modified timestamp (timezone-aware)
None: Modification time unknown (assume always changed)
"""
return self._modified_time

def _set_modified_time(self, modified_time: datetime | None) -> None:
"""
Set the modified time for this object.

Args:
modified_time (datetime | None): The modified time to set. If None, clears the modified time.
"""
self._modified_time = modified_time

def _update_modified_time(self) -> None:
"""
Update the modified time to the current time.
"""
self._modified_time = datetime.now(timezone.utc)

def updated_since(self, timestamp: datetime) -> bool:
"""
Check if the object has been updated since the given timestamp.

Args:
timestamp (datetime): The timestamp to compare against.

Returns:
bool: True if the object has been updated since the given timestamp, False otherwise.
"""
# if _modified_time is None, consider it always updated
if self._modified_time is None:
return True
return self._modified_time > timestamp


class OrcapodBase(TemporalMixin, LabelableMixin, ContentIdentifiableBase):
"""
Base class for all default OrcaPod entities, providing common functionality
including data context awareness, content-based identity, (semantic) labeling,
and modification timestamp.
"""

def __init__(
self,
label: str | None = None,
data_context: str | contexts.DataContext | None = None,
orcapod_config: Config | None = None,
):
# Init provided here for explicit listing of parmeters
super().__init__(
label=label, data_context=data_context, orcapod_config=orcapod_config
)

def __repr__(self):
return self.__class__.__name__

def __str__(self):
return self.label
Loading
Loading