From c148a502312ee54f1fa44350387c1799f74dc60e Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Tue, 1 Jul 2025 15:40:30 -0700 Subject: [PATCH 01/12] fix logging --- mapillary_tools/process_sequence_properties.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index c6ba7643b..9bf60d560 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -423,7 +423,7 @@ def _should_split_by_cutoff_time( assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) LOG.info( - "Found %s sequences after split by cutoff_time %d seconds", + "Found %s sequences after split by cutoff_time %s seconds", len(output_sequences), cutoff_time, ) @@ -462,7 +462,7 @@ def _should_split_by_cutoff_distance( assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) LOG.info( - "Found %s sequences after split by cutoff_distance %d meters", + "Found %s sequences after split by cutoff_distance %s meters", len(output_sequences), cutoff_distance, ) From 61be6b13f3865e6d4c9700c368d64ced1612f1ee Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 2 Jul 2025 15:38:09 -0700 Subject: [PATCH 02/12] fix splits --- .../process_sequence_properties.py | 390 +++++++++--------- 1 file changed, 191 insertions(+), 199 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 9bf60d560..21648def2 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -12,71 +12,38 @@ LOG = logging.getLogger(__name__) -SeqItem = T.TypeVar("SeqItem") +S = T.TypeVar("S") +R = T.TypeVar("R") PointSequence = T.List[geo.PointLike] def split_sequence_by( - sequence: T.Sequence[SeqItem], - should_split: T.Callable[[SeqItem, SeqItem], bool], -) -> list[list[SeqItem]]: + sequence: T.Iterable[S], reduce: T.Callable[[R, S], tuple[R, bool]], initial: R +) -> list[list[S]]: """ Split a sequence into multiple sequences by should_split(prev, cur) => True """ - output_sequences: list[list[SeqItem]] = [] - if sequence: - output_sequences.append([sequence[0]]) - - for prev, cur in geo.pairwise(sequence): - # invariant: prev is processed - if should_split(prev, cur): - output_sequences.append([cur]) - else: - output_sequences[-1].append(cur) - # invariant: cur is processed - - assert sum(len(s) for s in output_sequences) == len(sequence), ( - output_sequences, - sequence, - ) - - return output_sequences - - -def split_sequence_by_agg( - sequence: T.Sequence[SeqItem], - should_split_with_sequence_state: T.Callable[[SeqItem, dict], bool], -) -> list[list[SeqItem]]: - """ - Split a sequence by should_split_with_sequence_state(cur, sequence_state) => True - """ - output_sequences: list[list[SeqItem]] = [] - sequence_state: dict = {} - - for cur in sequence: - start_new_sequence = should_split_with_sequence_state(cur, sequence_state) + output_sequences: list[list[S]] = [] - if not output_sequences: - output_sequences.append([]) + value = initial - if start_new_sequence: - # DO NOT reset the state because it contains the information of current item - # sequence_state = {} - if output_sequences[-1]: - output_sequences.append([]) + for element in sequence: + value, should = reduce(value, element) - output_sequences[-1].append(cur) - - assert sum(len(s) for s in output_sequences) == len(sequence) + if should: + 
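# A minimal usage sketch of the reducer contract implemented above
# (the helper name `split_every` is illustrative, not part of the patch):
# reduce(state, element) -> (new_state, should_split), and when
# should_split is True the current element starts the new subsequence.
#
#     def split_every(seq, n):
#         def reducer(count, _):
#             count += 1
#             return (1, True) if n < count else (count, False)
#         return split_sequence_by(seq, reducer, initial=0)
#
#     split_every([1, 2, 3, 4, 5], 2)  # -> [[1, 2], [3, 4], [5]]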
output_sequences.append([element]) + else: + if output_sequences: + output_sequences[-1].append(element) + else: + output_sequences.append([element]) return output_sequences def duplication_check( - sequence: PointSequence, - max_duplicate_distance: float, - max_duplicate_angle: float, + sequence: PointSequence, max_duplicate_distance: float, max_duplicate_angle: float ) -> tuple[PointSequence, list[types.ErrorMetadata]]: dedups: PointSequence = [] dups: list[types.ErrorMetadata] = [] @@ -299,11 +266,12 @@ def _check_video_limits( else: output_video_metadatas.append(video_metadata) - LOG.info( - "Found %s videos and %s errors after video limit checks", - len(output_video_metadatas), - len(error_metadatas), - ) + if error_metadatas: + LOG.info( + "Found %s videos and %s errors after video limit checks", + len(output_video_metadatas), + len(error_metadatas), + ) return output_video_metadatas, error_metadatas @@ -360,11 +328,12 @@ def _check_sequences_by_limits( len(s) for s in input_sequences ) - LOG.info( - "Found %s sequences and %s errors after sequence limit checks", - len(output_sequences), - len(output_errors), - ) + if output_errors: + LOG.info( + "Found %s sequences and %s errors after sequence limit checks", + len(output_sequences), + len(output_errors), + ) return output_sequences, output_errors @@ -394,82 +363,6 @@ def _group_by_folder_and_camera( return output_sequences -def _split_sequences_by_cutoff_time( - input_sequences: T.Sequence[PointSequence], cutoff_time: float -) -> list[PointSequence]: - def _should_split_by_cutoff_time( - prev: types.ImageMetadata, cur: types.ImageMetadata - ) -> bool: - time_diff = cur.time - prev.time - assert 0 <= time_diff, "sequence must be sorted by capture times" - should = cutoff_time < time_diff - if should: - LOG.debug( - "Split because the capture time gap %s seconds exceeds cutoff_time (%s seconds): %s: %s -> %s", - round(time_diff, 2), - round(cutoff_time, 2), - prev.filename.parent, - prev.filename.name, - cur.filename.name, - ) - return should - - output_sequences = [] - for sequence in input_sequences: - output_sequences.extend( - split_sequence_by(sequence, should_split=_should_split_by_cutoff_time) - ) - - assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) - - LOG.info( - "Found %s sequences after split by cutoff_time %s seconds", - len(output_sequences), - cutoff_time, - ) - - return output_sequences - - -def _split_sequences_by_cutoff_distance( - input_sequences: T.Sequence[PointSequence], cutoff_distance: float -) -> list[PointSequence]: - def _should_split_by_cutoff_distance( - prev: types.ImageMetadata, cur: types.ImageMetadata - ) -> bool: - distance = geo.gps_distance( - (prev.lat, prev.lon), - (cur.lat, cur.lon), - ) - should = cutoff_distance < distance - if should: - LOG.debug( - "Split because the distance gap %s meters exceeds cutoff_distance (%s meters): %s: %s -> %s", - round(distance, 2), - round(cutoff_distance, 2), - prev.filename.parent, - prev.filename.name, - cur.filename.name, - ) - return should - - output_sequences = [] - for sequence in input_sequences: - output_sequences.extend( - split_sequence_by(sequence, _should_split_by_cutoff_distance) - ) - - assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) - - LOG.info( - "Found %s sequences after split by cutoff_distance %s meters", - len(output_sequences), - cutoff_distance, - ) - - return output_sequences - - def _check_sequences_duplication( input_sequences: T.Sequence[PointSequence], 
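# The duplicate criterion used by duplication_check, restated as a
# standalone predicate (an illustrative helper, not in the codebase): an
# image duplicates its previous image when it moved at most
# max_duplicate_distance meters AND turned at most max_duplicate_angle
# degrees; a missing angle on either image counts as a match.
def _is_duplicate(prev, cur, max_distance: float, max_angle: float) -> bool:
    distance = geo.gps_distance((prev.lat, prev.lon), (cur.lat, cur.lon))
    if prev.angle is not None and cur.angle is not None:
        angle_diff = geo.diff_bearing(prev.angle, cur.angle)
    else:
        angle_diff = None
    return distance <= max_distance and (
        angle_diff is None or angle_diff <= max_angle
    )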
duplicate_distance: float, @@ -492,88 +385,188 @@ def _check_sequences_duplication( len(s) for s in input_sequences ) - LOG.info( - "Found %s sequences and %s errors after duplication check", - len(output_sequences), - len(output_errors), - ) + if output_errors: + LOG.info( + "Found %s sequences and %s errors after duplication check", + len(output_sequences), + len(output_errors), + ) return output_sequences, output_errors +class SplitState(T.TypedDict, total=False): + sequence_images: int + sequence_file_size: int + sequence_pixels: int + split: bool + image: types.ImageMetadata + + def _split_sequences_by_limits( input_sequences: T.Sequence[PointSequence], - max_sequence_filesize_in_bytes: float, - max_sequence_pixels: float, + max_sequence_filesize_in_bytes: float | None = None, + max_sequence_pixels: float | None = None, + max_sequence_images: int | None = None, + cutoff_time: float | None = None, + cutoff_distance: float | None = None, ) -> list[PointSequence]: - max_sequence_images = constants.MAX_SEQUENCE_LENGTH - max_sequence_filesize = max_sequence_filesize_in_bytes + def _should_split_by_max_sequence_images( + state: SplitState, _: types.ImageMetadata + ) -> tuple[SplitState, bool]: + if max_sequence_images is None: + return state, False - def _should_split(image: types.ImageMetadata, sequence_state: dict) -> bool: - last_sequence_images = sequence_state.get("last_sequence_images", 0) - last_sequence_file_size = sequence_state.get("last_sequence_file_size", 0) - last_sequence_pixels = sequence_state.get("last_sequence_pixels", 0) + split = state.get("split", False) - # decent default values if width/height not available - width = 1024 if image.width is None else image.width - height = 1024 if image.height is None else image.height - pixels = width * height + if split: + new_sequence_images = 1 + else: + new_sequence_images = state.get("sequence_images", 0) + 1 + split = max_sequence_images < new_sequence_images + if split: + LOG.debug( + f"Split because {new_sequence_images=} < {max_sequence_images=}" + ) + + state["sequence_images"] = new_sequence_images + + return state, split + + def _should_split_by_cutoff_time( + state: SplitState, image: types.ImageMetadata + ) -> tuple[SplitState, bool]: + if cutoff_time is None: + return state, False + + split = state.get("split", False) + + if split: + pass + else: + last_image = state.get("image") + if last_image is not None: + diff = image.time - last_image.time + split = cutoff_time < diff + if split: + LOG.debug(f"Split because {cutoff_time=} < {diff=}") + + state["image"] = image + + return state, split + + def _should_split_by_cutoff_distance( + state: SplitState, image: types.ImageMetadata + ) -> tuple[SplitState, bool]: + if cutoff_distance is None: + return state, False + + split = state.get("split", False) + + if split: + pass + else: + last_image = state.get("image") + if last_image is not None: + diff = geo.gps_distance( + (last_image.lat, last_image.lon), (image.lat, image.lon) + ) + split = cutoff_distance < diff + if split: + LOG.debug(f"Split because {cutoff_distance=} < {diff=}") + + state["image"] = image + + return state, split + + def _should_split_by_max_sequence_filesize( + state: SplitState, image: types.ImageMetadata + ) -> tuple[SplitState, bool]: + if max_sequence_filesize_in_bytes is None: + return state, False + + split = state.get("split", False) if image.filesize is None: filesize = os.path.getsize(image.filename) else: filesize = image.filesize - new_sequence_images = last_sequence_images + 1 - 
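# How the SplitState TypedDict behaves in practice (a small
# illustration): total=False makes every key optional, so an empty dict
# is a valid "fresh sequence" state and the split predicates read it
# with .get() defaults.
#
#     state: SplitState = {}
#     state["sequence_images"] = state.get("sequence_images", 0) + 1
#     state["sequence_file_size"] = state.get("sequence_file_size", 0) + 4096
#     assert state == {"sequence_images": 1, "sequence_file_size": 4096}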
new_sequence_file_size = last_sequence_file_size + filesize - new_sequence_pixels = last_sequence_pixels + pixels - - if max_sequence_images < new_sequence_images: - LOG.debug( - "Split because the current sequence (%s) reaches the max number of images (%s)", - new_sequence_images, - max_sequence_images, - ) - start_new_sequence = True - elif max_sequence_filesize < new_sequence_file_size: - LOG.debug( - "Split because the current sequence (%s) reaches the max filesize (%s)", - new_sequence_file_size, - max_sequence_filesize, - ) - start_new_sequence = True - elif max_sequence_pixels < new_sequence_pixels: - LOG.debug( - "Split because the current sequence (%s) reaches the max pixels (%s)", - new_sequence_pixels, - max_sequence_pixels, - ) - start_new_sequence = True + if split: + new_sequence_file_size = filesize else: - start_new_sequence = False + sequence_file_size = state.get("sequence_file_size", 0) + new_sequence_file_size = sequence_file_size + filesize + split = max_sequence_filesize_in_bytes < new_sequence_file_size + if split: + LOG.debug( + f"Split because {max_sequence_filesize_in_bytes=} < {new_sequence_file_size=}" + ) + + state["sequence_file_size"] = new_sequence_file_size + + return state, split + + def _should_split_by_max_sequence_pixels( + state: SplitState, image: types.ImageMetadata + ) -> tuple[SplitState, bool]: + if max_sequence_pixels is None: + return state, False - if not start_new_sequence: - sequence_state["last_sequence_images"] = new_sequence_images - sequence_state["last_sequence_file_size"] = new_sequence_file_size - sequence_state["last_sequence_pixels"] = new_sequence_pixels + split = state.get("split", False) + + # Decent default values if width/height not available + width = 1024 if image.width is None else image.width + height = 1024 if image.height is None else image.height + pixels = width * height + + if split: + new_sequence_pixels = pixels else: - sequence_state["last_sequence_images"] = 1 - sequence_state["last_sequence_file_size"] = filesize - sequence_state["last_sequence_pixels"] = pixels + sequence_pixels = state.get("sequence_pixels", 0) + new_sequence_pixels = sequence_pixels + pixels + split = max_sequence_pixels < new_sequence_pixels + if split: + LOG.debug( + f"Split because {max_sequence_pixels=} < {new_sequence_pixels=}" + ) + + state["sequence_pixels"] = new_sequence_pixels + + return state, split - return start_new_sequence + def _should_split_agg( + state: SplitState, image: types.ImageMetadata + ) -> tuple[SplitState, bool]: + split = False + + for should_split in [ + _should_split_by_max_sequence_images, + _should_split_by_cutoff_time, + _should_split_by_cutoff_distance, + _should_split_by_max_sequence_filesize, + _should_split_by_max_sequence_pixels, + ]: + state, split = should_split(state, image) + if split: + state["split"] = True + + return state, split output_sequences = [] for sequence in input_sequences: output_sequences.extend( - split_sequence_by_agg( - sequence, should_split_with_sequence_state=_should_split + split_sequence_by( + sequence, _should_split_agg, initial=T.cast(SplitState, {}) ) ) assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) - LOG.info("Found %s sequences after split by sequence limits", len(output_sequences)) + if len(input_sequences) != len(output_sequences): + LOG.info( + f"Split {len(input_sequences)} into {len(output_sequences)} sequences by limits" + ) return output_sequences @@ -632,9 +625,15 @@ def process_sequence_properties( for sequence in sequences: 
_interpolate_subsecs_for_sorting(sequence) - # Split sequences by cutoff time + # Split sequences by max number of images, max filesize, max pixels, and cutoff time # NOTE: Do not split by distance here because it affects the speed limit check - sequences = _split_sequences_by_cutoff_time(sequences, cutoff_time=cutoff_time) + sequences = _split_sequences_by_limits( + sequences, + max_sequence_filesize_in_bytes=max_sequence_filesize_in_bytes, + max_sequence_pixels=max_sequence_pixels, + max_sequence_images=constants.MAX_SEQUENCE_LENGTH, + cutoff_time=cutoff_time, + ) # Duplication check sequences, errors = _check_sequences_duplication( @@ -651,13 +650,6 @@ def process_sequence_properties( image.angle = None geo.interpolate_directions_if_none(sequence) - # Split sequences by max number of images, max filesize, and max pixels - sequences = _split_sequences_by_limits( - sequences, - max_sequence_filesize_in_bytes=max_sequence_filesize_in_bytes, - max_sequence_pixels=max_sequence_pixels, - ) - # Check limits for sequences sequences, errors = _check_sequences_by_limits( sequences, @@ -668,7 +660,7 @@ def process_sequence_properties( # Split sequences by cutoff distance # NOTE: The speed limit check probably rejects most of anomalies - sequences = _split_sequences_by_cutoff_distance( + sequences = _split_sequences_by_limits( sequences, cutoff_distance=cutoff_distance ) From 46008cf8b059d1c1aea3c91b581a04bcb74ca4d9 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 2 Jul 2025 15:45:36 -0700 Subject: [PATCH 03/12] add doctests --- .../process_sequence_properties.py | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 21648def2..431318c1b 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -21,7 +21,49 @@ def split_sequence_by( sequence: T.Iterable[S], reduce: T.Callable[[R, S], tuple[R, bool]], initial: R ) -> list[list[S]]: """ - Split a sequence into multiple sequences by should_split(prev, cur) => True + Split a sequence into multiple subsequences based on a reduction function. + + The function processes each element through a reduce function that maintains + state and determines whether to split the sequence at that point. When a split + is triggered, a new subsequence starts with the current element. + + Args: + sequence: An iterable of elements to split + reduce: A function that takes (accumulated_state, current_element) and + returns (new_state, should_split). If should_split is True, + a new subsequence starts with the current element. + initial: The initial state value passed to the reduce function + + Returns: + A list of subsequences, where each subsequence is a list of elements + + Examples: + >>> # Split on even numbers + >>> def split_on_even(count, x): + ... return count + 1, x % 2 == 0 + >>> split_sequence_by([1, 3, 2, 4, 5, 6, 7], split_on_even, 0) + [[1, 3], [2], [4, 5], [6, 7]] + + >>> # Split when sum exceeds threshold + >>> def split_when_sum_exceeds_5(total, x): + ... total += x + ... return (x, True) if total > 5 else (total, False) + >>> split_sequence_by([1, 2, 3, 4, 1, 2], split_when_sum_exceeds_5, 0) + [[1, 2], [3], [4, 1], [2]] + + >>> # Split on specific values + >>> def split_on_zero(_, x): + ... 
return None, x == 0 + >>> split_sequence_by([1, 2, 0, 3, 4, 0, 5], split_on_zero, None) + [[1, 2], [0, 3, 4], [0, 5]] + + >>> # Empty sequence + >>> split_sequence_by([], lambda s, x: (s, False), 0) + [] + + >>> # Single element + >>> split_sequence_by([42], lambda s, x: (s, False), 0) + [[42]] """ output_sequences: list[list[S]] = [] From 061893aa516ffde28fe0ec8f17a31e61de3d56bc Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 2 Jul 2025 18:20:18 -0700 Subject: [PATCH 04/12] fix tests --- .../process_sequence_properties.py | 246 ++++++++++-------- tests/unit/test_sequence_processing.py | 181 ++++++------- 2 files changed, 212 insertions(+), 215 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 431318c1b..98da05200 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import itertools import logging import math @@ -441,157 +442,174 @@ class SplitState(T.TypedDict, total=False): sequence_images: int sequence_file_size: int sequence_pixels: int - split: bool image: types.ImageMetadata -def _split_sequences_by_limits( - input_sequences: T.Sequence[PointSequence], - max_sequence_filesize_in_bytes: float | None = None, - max_sequence_pixels: float | None = None, - max_sequence_images: int | None = None, - cutoff_time: float | None = None, - cutoff_distance: float | None = None, -) -> list[PointSequence]: - def _should_split_by_max_sequence_images( - state: SplitState, _: types.ImageMetadata - ) -> tuple[SplitState, bool]: - if max_sequence_images is None: - return state, False +def _should_split_by_max_sequence_images( + state: SplitState, + _: types.ImageMetadata, + max_sequence_images: int, + split: bool = False, +) -> tuple[SplitState, bool]: + if not split: + new_sequence_images = state.get("sequence_images", 0) + 1 + split = max_sequence_images < new_sequence_images + if split: + LOG.debug(f"Split because {new_sequence_images=} < {max_sequence_images=}") - split = state.get("split", False) + if split: + new_sequence_images = 1 - if split: - new_sequence_images = 1 - else: - new_sequence_images = state.get("sequence_images", 0) + 1 - split = max_sequence_images < new_sequence_images - if split: - LOG.debug( - f"Split because {new_sequence_images=} < {max_sequence_images=}" - ) + state["sequence_images"] = new_sequence_images - state["sequence_images"] = new_sequence_images + return state, split - return state, split - def _should_split_by_cutoff_time( - state: SplitState, image: types.ImageMetadata - ) -> tuple[SplitState, bool]: - if cutoff_time is None: - return state, False +def _should_split_by_cutoff_time( + state: SplitState, + image: types.ImageMetadata, + cutoff_time: float, + split: bool = False, +) -> tuple[SplitState, bool]: + if not split: + last_image = state.get("image") + if last_image is not None: + diff = image.time - last_image.time + split = cutoff_time < diff + if split: + LOG.debug(f"Split because {cutoff_time=} < {diff=}") - split = state.get("split", False) + state["image"] = image - if split: - pass - else: - last_image = state.get("image") - if last_image is not None: - diff = image.time - last_image.time - split = cutoff_time < diff - if split: - LOG.debug(f"Split because {cutoff_time=} < {diff=}") + return state, split - state["image"] = image - return state, split +def _should_split_by_cutoff_distance( + state: SplitState, + image: types.ImageMetadata, + 
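# Why each predicate accepts the running `split` flag (the numbers below
# are hypothetical): once an earlier criterion has decided to split, a
# later criterion must skip its own comparison and simply reset its
# accumulator to the current image, otherwise its running total would
# leak across the new sequence boundary. For the image counter:
#
#     state, split = {"sequence_images": 999}, True  # a prior predicate split
#     state, split = _should_split_by_max_sequence_images(
#         state, image, max_sequence_images=1000, split=split
#     )
#     assert split is True and state["sequence_images"] == 1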
cutoff_distance: float, + split: bool = False, +) -> tuple[SplitState, bool]: + if not split: + last_image = state.get("image") + if last_image is not None: + diff = geo.gps_distance( + (last_image.lat, last_image.lon), (image.lat, image.lon) + ) + split = cutoff_distance < diff + if split: + LOG.debug(f"Split because {cutoff_distance=} < {diff=}") - def _should_split_by_cutoff_distance( - state: SplitState, image: types.ImageMetadata - ) -> tuple[SplitState, bool]: - if cutoff_distance is None: - return state, False + state["image"] = image - split = state.get("split", False) + return state, split + +def _should_split_by_max_sequence_filesize( + state: SplitState, + image: types.ImageMetadata, + max_sequence_filesize_in_bytes: float, + split: bool = False, +) -> tuple[SplitState, bool]: + if image.filesize is None: + filesize = os.path.getsize(image.filename) + else: + filesize = image.filesize + + if not split: + new_sequence_file_size = state.get("sequence_file_size", 0) + filesize + split = max_sequence_filesize_in_bytes < new_sequence_file_size if split: - pass - else: - last_image = state.get("image") - if last_image is not None: - diff = geo.gps_distance( - (last_image.lat, last_image.lon), (image.lat, image.lon) - ) - split = cutoff_distance < diff - if split: - LOG.debug(f"Split because {cutoff_distance=} < {diff=}") + LOG.debug( + f"Split because {max_sequence_filesize_in_bytes=} < {new_sequence_file_size=}" + ) - state["image"] = image + if split: + new_sequence_file_size = filesize - return state, split + state["sequence_file_size"] = new_sequence_file_size - def _should_split_by_max_sequence_filesize( - state: SplitState, image: types.ImageMetadata - ) -> tuple[SplitState, bool]: - if max_sequence_filesize_in_bytes is None: - return state, False + return state, split - split = state.get("split", False) - if image.filesize is None: - filesize = os.path.getsize(image.filename) - else: - filesize = image.filesize +def _should_split_by_max_sequence_pixels( + state: SplitState, + image: types.ImageMetadata, + max_sequence_pixels: float, + split: bool = False, +) -> tuple[SplitState, bool]: + # Decent default values if width/height not available + width = 1024 if image.width is None else image.width + height = 1024 if image.height is None else image.height + pixels = width * height + if not split: + new_sequence_pixels = state.get("sequence_pixels", 0) + pixels + split = max_sequence_pixels < new_sequence_pixels if split: - new_sequence_file_size = filesize - else: - sequence_file_size = state.get("sequence_file_size", 0) - new_sequence_file_size = sequence_file_size + filesize - split = max_sequence_filesize_in_bytes < new_sequence_file_size - if split: - LOG.debug( - f"Split because {max_sequence_filesize_in_bytes=} < {new_sequence_file_size=}" - ) + LOG.debug(f"Split because {max_sequence_pixels=} < {new_sequence_pixels=}") - state["sequence_file_size"] = new_sequence_file_size + if split: + new_sequence_pixels = pixels - return state, split + state["sequence_pixels"] = new_sequence_pixels - def _should_split_by_max_sequence_pixels( - state: SplitState, image: types.ImageMetadata - ) -> tuple[SplitState, bool]: - if max_sequence_pixels is None: - return state, False + return state, split - split = state.get("split", False) - # Decent default values if width/height not available - width = 1024 if image.width is None else image.width - height = 1024 if image.height is None else image.height - pixels = width * height +def _split_sequences_by_limits( + input_sequences: 
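# The predicates compose into a single reducer with functools.partial,
# each partial binding one threshold keyword. A self-contained sketch of
# the same dispatch pattern (names here are illustrative):
import functools

def over_limit(total: int, n: int, limit: int, split: bool = False):
    if not split:
        total += n
        split = limit < total
    if split:
        total = n  # reset the accumulator for the new subsequence
    return total, split

check = functools.partial(over_limit, limit=10)
assert check(8, 5) == (5, True)   # 13 exceeds 10: split and reset
assert check(3, 5) == (8, False)  # 8 is within 10: keep accumulating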
T.Sequence[PointSequence], + max_sequence_filesize_in_bytes: float | None = None, + max_sequence_pixels: float | None = None, + max_sequence_images: int | None = None, + cutoff_time: float | None = None, + cutoff_distance: float | None = None, +) -> list[PointSequence]: + should_splits = [] - if split: - new_sequence_pixels = pixels - else: - sequence_pixels = state.get("sequence_pixels", 0) - new_sequence_pixels = sequence_pixels + pixels - split = max_sequence_pixels < new_sequence_pixels - if split: - LOG.debug( - f"Split because {max_sequence_pixels=} < {new_sequence_pixels=}" - ) + if max_sequence_images is not None: + should_splits.append( + functools.partial( + _should_split_by_max_sequence_images, + max_sequence_images=max_sequence_images, + ) + ) - state["sequence_pixels"] = new_sequence_pixels + if cutoff_time is not None: + should_splits.append( + functools.partial(_should_split_by_cutoff_time, cutoff_time=cutoff_time) + ) - return state, split + if cutoff_distance is not None: + should_splits.append( + functools.partial( + _should_split_by_cutoff_distance, cutoff_distance=cutoff_distance + ) + ) + + if max_sequence_filesize_in_bytes is not None: + should_splits.append( + functools.partial( + _should_split_by_max_sequence_filesize, + max_sequence_filesize_in_bytes=max_sequence_filesize_in_bytes, + ) + ) + + if max_sequence_pixels is not None: + should_splits.append( + functools.partial( + _should_split_by_max_sequence_pixels, + max_sequence_pixels=max_sequence_pixels, + ) + ) def _should_split_agg( state: SplitState, image: types.ImageMetadata ) -> tuple[SplitState, bool]: split = False - for should_split in [ - _should_split_by_max_sequence_images, - _should_split_by_cutoff_time, - _should_split_by_cutoff_distance, - _should_split_by_max_sequence_filesize, - _should_split_by_max_sequence_pixels, - ]: - state, split = should_split(state, image) - if split: - state["split"] = True + for should_split in should_splits: + state, split = should_split(state, image, split=split) return state, split diff --git a/tests/unit/test_sequence_processing.py b/tests/unit/test_sequence_processing.py index 5ff306393..ca8c0f52e 100644 --- a/tests/unit/test_sequence_processing.py +++ b/tests/unit/test_sequence_processing.py @@ -6,7 +6,6 @@ import pytest from mapillary_tools import ( - constants, exceptions, geo, process_geotag_properties as pgp, @@ -21,22 +20,19 @@ def _make_image_metadata( lng: float, lat: float, time: float, - angle: T.Optional[float] = None, + angle: float | None = None, + filesize: int = 0, **kwargs, ) -> types.ImageMetadata: - filename = filename.resolve() - if not filename.exists(): - filename.parent.mkdir(parents=True, exist_ok=True) - with filename.open("w"): - pass return types.ImageMetadata( - filename=filename, + filename=filename.resolve(), + filesize=filesize, lon=lng, lat=lat, time=time, alt=None, - **kwargs, angle=angle, + **kwargs, ) @@ -349,7 +345,6 @@ def test_interpolation(tmpdir: py.path.local): def test_subsec_interpolation(tmpdir: py.path.local): - constants.MAX_SEQUENCE_LENGTH = 2 curdir = tmpdir.mkdir("hello222").mkdir("world333") sequence: T.List[types.Metadata] = [ # s1 @@ -449,7 +444,7 @@ def test_process_finalize(setup_data): { "filename": str(test_exif), "filetype": "image", - "filesize": None, + "filesize": 0, "MAPFilename": "test_exif.jpg", "MAPLatitude": 1, "MAPLongitude": 1, @@ -488,11 +483,10 @@ def test_process_finalize(setup_data): def test_cut_by_pixels(tmpdir: py.path.local): - curdir = tmpdir.mkdir("hello77").mkdir("world88") sequence: 
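# Usage note for the rewritten test helper above (relying only on the
# signature as defined in this patch): it no longer creates files on
# disk, because filesize defaults to 0 in the metadata itself, so the
# filesize-based split predicate never needs to stat() a test image.
# The path below is purely illustrative and is never created:
#
#     img = _make_image_metadata(Path("/tmp/a.jpg"), 1.0, 1.0, 1.0, filesize=1)
#     assert img.filesize == 1 and img.angle is None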
T.List[types.Metadata] = [ # s2 _make_image_metadata( - Path(curdir) / Path("./a.jpg"), + Path(tmpdir) / Path("./a.jpg"), 2, 2, 1, @@ -501,7 +495,7 @@ def test_cut_by_pixels(tmpdir: py.path.local): height=2, ), _make_image_metadata( - Path(curdir) / Path("./b.jpg"), + Path(tmpdir) / Path("./b.jpg"), 2.00001, 2.00001, 20, @@ -511,7 +505,7 @@ def test_cut_by_pixels(tmpdir: py.path.local): ), # s1 _make_image_metadata( - Path(curdir) / Path("./c.jpg"), + Path(tmpdir) / Path("./c.jpg"), 2.00002, 2.00002, 30, @@ -541,10 +535,9 @@ def test_cut_by_pixels(tmpdir: py.path.local): def test_video_error(tmpdir: py.path.local): - curdir = tmpdir.mkdir("hello222").mkdir("videos") sequence: T.List[types.Metadata] = [ types.VideoMetadata( - Path(curdir) / Path("test_video_null_island.mp4"), + Path(tmpdir) / Path("test_video_null_island.mp4"), types.FileType.VIDEO, points=[ geo.Point(1, -0.00001, -0.00001, 1, angle=None), @@ -556,7 +549,7 @@ def test_video_error(tmpdir: py.path.local): filesize=123, ), types.VideoMetadata( - Path(curdir) / Path("test_video_too_fast.mp4"), + Path(tmpdir) / Path("test_video_too_fast.mp4"), types.FileType.VIDEO, points=[ geo.Point(1, 1, 1, 1, angle=None), @@ -568,7 +561,7 @@ def test_video_error(tmpdir: py.path.local): filesize=123, ), types.VideoMetadata( - Path(curdir) / Path("test_video_file_too_large.mp4"), + Path(tmpdir) / Path("test_video_file_too_large.mp4"), types.FileType.VIDEO, points=[ geo.Point(1, 1, 1, 1, angle=None), @@ -579,7 +572,7 @@ def test_video_error(tmpdir: py.path.local): filesize=1024 * 1024 * 1024 * 200, ), types.VideoMetadata( - Path(curdir) / Path("test_good.mp4"), + Path(tmpdir) / Path("test_good.mp4"), types.FileType.VIDEO, points=[ geo.Point(1, 1, 1, 1, angle=None), @@ -620,101 +613,87 @@ def test_video_error(tmpdir: py.path.local): ) -def test_split_sequence_by(): - """Test split_sequence_by function.""" - # Create test points - p1 = geo.Point(1, 1.00000, 1.00000, 1, angle=0) - p2 = geo.Point(2, 1.00001, 1.00001, 2, angle=0) - p3 = geo.Point(3, 1.00002, 1.00002, 3, angle=0) - p4 = geo.Point(10, 1.00003, 1.00003, 4, angle=0) # Large time gap - p5 = geo.Point(11, 1.00004, 1.00004, 5, angle=0) - p6 = geo.Point(12, 1.10000, 1.10000, 6, angle=0) # Large distance gap - p7 = geo.Point(13, 1.10001, 1.10001, 7, angle=0) - - # Create a sequence of points - sequence = [p1, p2, p3, p4, p5, p6, p7] - - # Test split by time gaps (> 5 seconds) - split_by_time = lambda prev, cur: cur.time - prev.time > 5 - sequences = psp.split_sequence_by(sequence, split_by_time) - - # Should be split into two sequences [p1,p2,p3], [p4,p5,p6,p7] - assert len(sequences) == 2 - assert sequences[0] == [p1, p2, p3] - assert sequences[1] == [p4, p5, p6, p7] - - # Test split by large distance gaps - def split_by_distance(prev, cur): - distance = geo.gps_distance( - (prev.lat, prev.lon), - (cur.lat, cur.lon), +def test_split_sequence_by_filesize(tmpdir): + sequence: T.List[types.Metadata] = [ + # s1 + _make_image_metadata( + Path(tmpdir) / Path("./a.jpg"), 2, 2, 1, filesize=110 * 1024 * 1024 * 1024 + ), + # s2 + _make_image_metadata( + Path(tmpdir) / Path("./b.jpg"), 2.00001, 2.00001, 2, filesize=1 + ), + _make_image_metadata( + Path(tmpdir) / Path("./c.jpg"), 2.00002, 2.00002, 2, filesize=1 + ), + ] + + metadatas = psp.process_sequence_properties(sequence) + assert 2 == len({m.MAPSequenceUUID for m in metadatas}) # type: ignore + + +def test_split_sequence_by_image_count(tmpdir): + max_allowed_images = 1000 + + sequence = [] + for i in range(1, max_allowed_images + 1): + image = 
_make_image_metadata( + Path(tmpdir) / Path(f"./a{i}.jpg"), + 1 + i * 0.00001, + 1 + i * 0.00001, + i, + filesize=1, ) - should = distance > 1000 # Split if distance > 1000 meters - return should + sequence.append(image) - sequences = psp.split_sequence_by(sequence, split_by_distance) + metadatas = psp.process_sequence_properties(sequence) + assert 1 == len({m.MAPSequenceUUID for m in metadatas}), metadatas # type: ignore - # Should be split into two sequences [p1,p2,p3,p4,p5], [p6,p7] - assert len(sequences) == 2 - assert sequences[0] == [p1, p2, p3, p4, p5] - assert sequences[1] == [p6, p7] - # Test empty sequence - empty_sequences = psp.split_sequence_by([], split_by_time) - assert len(empty_sequences) == 0 +def test_split_sequence_by_image_count_split(tmpdir): + max_allowed_images = 1000 - # Test single point sequence - single_point = [p1] - single_sequences = psp.split_sequence_by(single_point, split_by_time) - assert len(single_sequences) == 1 - assert single_sequences[0] == [p1] + sequence = [] + for i in range(1, max_allowed_images + 2): + image = _make_image_metadata( + Path(tmpdir) / Path(f"./a{i}.jpg"), + 1 + i * 0.00001, + 1 + i * 0.00001, + i, + filesize=1, + ) + sequence.append(image) - sequences = psp.split_sequence_by([], split_by_time) - assert len(sequences) == 0 + metadatas = psp.process_sequence_properties(sequence) + assert 2 == len({m.MAPSequenceUUID for m in metadatas}), metadatas # type: ignore -def test_split_sequence_by_agg(tmpdir): - curdir = tmpdir.mkdir("hello77").mkdir("world88") +def test_split_sequence_by_cutoff_time(tmpdir): sequence: T.List[types.Metadata] = [ # s1 - _make_image_metadata( - Path(curdir) / Path("./a.jpg"), - 2, - 2, - 1, - filesize=110 * 1024 * 1024 * 1024, - ), + _make_image_metadata(Path(tmpdir) / Path("./a.jpg"), 1, 1, 1, filesize=1), # s2 _make_image_metadata( - Path(curdir) / Path("./b.jpg"), - 2.00001, - 2.00001, - 2, - filesize=1, + Path(tmpdir) / Path("./b.jpg"), 1.00001, 1.00001, 600, filesize=1 ), - # s3 _make_image_metadata( - Path(curdir) / Path("./c.jpg"), - 2.00002, - 2.00002, - 3, - filesize=110 * 1024 * 1024 * 1024 - 1, - ), - _make_image_metadata( - Path(curdir) / Path("./c.jpg"), - 2.00003, - 2.00003, - 4, - filesize=1, + Path(tmpdir) / Path("./c.jpg"), 1.00002, 1.00002, 601, filesize=1 ), ] - metadatas = psp.process_sequence_properties( - sequence, - cutoff_distance=1000000000, - cutoff_time=100, - interpolate_directions=True, - duplicate_distance=0.1, - duplicate_angle=0.1, - ) - assert 3 == len({m.MAPSequenceUUID for m in metadatas}) # type: ignore + metadatas = psp.process_sequence_properties(sequence) + assert 2 == len({m.MAPSequenceUUID for m in metadatas}), metadatas # type: ignore + + +def test_split_sequence_no_split(tmpdir): + sequence: T.List[types.Metadata] = [ + # s1 + _make_image_metadata(Path(tmpdir) / Path("./a.jpg"), 1, 1, 1), + # s2 + _make_image_metadata(Path(tmpdir) / Path("./b.jpg"), 1.00001, 1.00001, 2), + # s3 + _make_image_metadata(Path(tmpdir) / Path("./c.jpg"), 1.00002, 1.00002, 3), + ] + + metadatas = psp.process_sequence_properties(sequence) + assert 1 == len({m.MAPSequenceUUID for m in metadatas}), metadatas # type: ignore From 05cf2909fd496974c6bf344c0b8163e36cb96dc1 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 2 Jul 2025 18:34:47 -0700 Subject: [PATCH 05/12] support inf --- .../process_sequence_properties.py | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py 
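# Arithmetic behind test_split_sequence_by_cutoff_time above, assuming
# the default MAPILLARY_CUTOFF_TIME of 60 seconds from constants.py:
assert 600 - 1 > 60     # a.jpg -> b.jpg gap is 599 s: forces a split
assert 601 - 600 <= 60  # b.jpg -> c.jpg gap is 1 s: same sequence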
b/mapillary_tools/process_sequence_properties.py index 98da05200..97073164f 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -182,9 +182,12 @@ def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: ) -def _parse_filesize_in_bytes(filesize_str: str) -> int: +def _parse_filesize_in_bytes(filesize_str: str) -> int | None: filesize_str = filesize_str.strip().upper() + if filesize_str in ["INF", "INFINITY"]: + return None + try: if filesize_str.endswith("B"): return int(filesize_str[:-1]) @@ -202,9 +205,12 @@ def _parse_filesize_in_bytes(filesize_str: str) -> int: ) -def _parse_pixels(pixels_str: str) -> int: +def _parse_pixels(pixels_str: str) -> int | None: pixels_str = pixels_str.strip().upper() + if pixels_str in ["INF", "INFINITY"]: + return None + try: if pixels_str.endswith("K"): return int(pixels_str[:-1]) * 1000 @@ -256,7 +262,7 @@ def _is_video_stationary( def _check_video_limits( video_metadatas: T.Iterable[types.VideoMetadata], - max_sequence_filesize_in_bytes: int, + max_sequence_filesize_in_bytes: int | None, max_avg_speed: float, max_radius_for_stationary_check: float, ) -> tuple[list[types.VideoMetadata], list[types.ErrorMetadata]]: @@ -272,15 +278,16 @@ def _check_video_limits( if is_stationary: raise exceptions.MapillaryStationaryVideoError("Stationary video") - video_filesize = ( - utils.get_file_size(video_metadata.filename) - if video_metadata.filesize is None - else video_metadata.filesize - ) - if video_filesize > max_sequence_filesize_in_bytes: - raise exceptions.MapillaryFileTooLargeError( - f"Video file size exceeds the maximum allowed file size ({max_sequence_filesize_in_bytes} bytes)", + if max_sequence_filesize_in_bytes is not None: + video_filesize = ( + utils.get_file_size(video_metadata.filename) + if video_metadata.filesize is None + else video_metadata.filesize ) + if video_filesize > max_sequence_filesize_in_bytes: + raise exceptions.MapillaryFileTooLargeError( + f"Video file size exceeds the maximum allowed file size ({max_sequence_filesize_in_bytes} bytes)", + ) contains_null_island = any( p.lat == 0 and p.lon == 0 for p in video_metadata.points @@ -321,7 +328,7 @@ def _check_video_limits( def _check_sequences_by_limits( input_sequences: T.Sequence[PointSequence], - max_sequence_filesize_in_bytes: int, + max_sequence_filesize_in_bytes: int | None, max_avg_speed: float, ) -> tuple[list[PointSequence], list[types.ErrorMetadata]]: output_sequences: list[PointSequence] = [] @@ -336,10 +343,11 @@ def _check_sequences_by_limits( ) try: - if sequence_filesize > max_sequence_filesize_in_bytes: - raise exceptions.MapillaryFileTooLargeError( - f"Sequence file size exceeds the maximum allowed file size ({max_sequence_filesize_in_bytes} bytes)", - ) + if max_sequence_filesize_in_bytes is not None: + if sequence_filesize > max_sequence_filesize_in_bytes: + raise exceptions.MapillaryFileTooLargeError( + f"Sequence file size exceeds the maximum allowed file size ({max_sequence_filesize_in_bytes} bytes)", + ) contains_null_island = any( image.lat == 0 and image.lon == 0 for image in sequence @@ -358,9 +366,7 @@ def _check_sequences_by_limits( for image in sequence: output_errors.append( types.describe_error_metadata( - exc=ex, - filename=image.filename, - filetype=types.FileType.IMAGE, + exc=ex, filename=image.filename, filetype=types.FileType.IMAGE ) ) @@ -508,7 +514,7 @@ def _should_split_by_cutoff_distance( def _should_split_by_max_sequence_filesize( state: SplitState, image: 
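# With this change, "inf"/"infinity" (any case) disables a limit by
# yielding None rather than a huge sentinel integer. A quick check,
# assuming the parser behavior defined in this patch:
assert _parse_filesize_in_bytes("110G") == 110 * 1024**3
assert _parse_filesize_in_bytes("INF") is None  # no filesize limit
assert _parse_pixels("6g") == 6 * 1000**3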
types.ImageMetadata, - max_sequence_filesize_in_bytes: float, + max_sequence_filesize_in_bytes: int, split: bool = False, ) -> tuple[SplitState, bool]: if image.filesize is None: @@ -535,7 +541,7 @@ def _should_split_by_max_sequence_filesize( def _should_split_by_max_sequence_pixels( state: SplitState, image: types.ImageMetadata, - max_sequence_pixels: float, + max_sequence_pixels: int, split: bool = False, ) -> tuple[SplitState, bool]: # Decent default values if width/height not available @@ -559,8 +565,8 @@ def _should_split_by_max_sequence_pixels( def _split_sequences_by_limits( input_sequences: T.Sequence[PointSequence], - max_sequence_filesize_in_bytes: float | None = None, - max_sequence_pixels: float | None = None, + max_sequence_filesize_in_bytes: int | None = None, + max_sequence_pixels: int | None = None, max_sequence_images: int | None = None, cutoff_time: float | None = None, cutoff_distance: float | None = None, From bf7c36a50f4d79d37d9aeec4440bc2f6c15228fc Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Wed, 2 Jul 2025 18:36:48 -0700 Subject: [PATCH 06/12] type --- tests/unit/test_sequence_processing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_sequence_processing.py b/tests/unit/test_sequence_processing.py index ca8c0f52e..0034fb90d 100644 --- a/tests/unit/test_sequence_processing.py +++ b/tests/unit/test_sequence_processing.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import itertools import typing as T from pathlib import Path From 16382096958f557e1e0c99d20aac98d4da7c15d1 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 00:23:46 -0700 Subject: [PATCH 07/12] reformat constants.py --- mapillary_tools/constants.py | 88 +++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/mapillary_tools/constants.py b/mapillary_tools/constants.py index fc1fc4007..858f14d00 100644 --- a/mapillary_tools/constants.py +++ b/mapillary_tools/constants.py @@ -8,44 +8,33 @@ def _yes_or_no(val: str) -> bool: - return val.strip().upper() in [ - "1", - "TRUE", - "YES", - ] + return val.strip().upper() in ["1", "TRUE", "YES"] -# In meters -CUTOFF_DISTANCE = float(os.getenv(_ENV_PREFIX + "CUTOFF_DISTANCE", 600)) +################### +##### GENERAL ##### +################### +USER_DATA_DIR = appdirs.user_data_dir(appname="mapillary_tools", appauthor="Mapillary") +PROMPT_DISABLED: bool = _yes_or_no(os.getenv(_ENV_PREFIX + "PROMPT_DISABLED", "NO")) + + +############################ +##### VIDEO PROCESSING ##### +############################ # In seconds -CUTOFF_TIME = float(os.getenv(_ENV_PREFIX + "CUTOFF_TIME", 60)) -DUPLICATE_DISTANCE = float(os.getenv(_ENV_PREFIX + "DUPLICATE_DISTANCE", 0.1)) -DUPLICATE_ANGLE = float(os.getenv(_ENV_PREFIX + "DUPLICATE_ANGLE", 5)) -MAX_AVG_SPEED = float( - os.getenv(_ENV_PREFIX + "MAX_AVG_SPEED", 400_000 / 3600) -) # 400 KM/h -# in seconds VIDEO_SAMPLE_INTERVAL = float(os.getenv(_ENV_PREFIX + "VIDEO_SAMPLE_INTERVAL", -1)) -# in meters +# In meters VIDEO_SAMPLE_DISTANCE = float(os.getenv(_ENV_PREFIX + "VIDEO_SAMPLE_DISTANCE", 3)) VIDEO_DURATION_RATIO = float(os.getenv(_ENV_PREFIX + "VIDEO_DURATION_RATIO", 1)) FFPROBE_PATH: str = os.getenv(_ENV_PREFIX + "FFPROBE_PATH", "ffprobe") FFMPEG_PATH: str = os.getenv(_ENV_PREFIX + "FFMPEG_PATH", "ffmpeg") -# When not set, MT will try to check both "exiftool" and "exiftool.exe" from $PATH -EXIFTOOL_PATH: str | None = os.getenv(_ENV_PREFIX + "EXIFTOOL_PATH") +EXIFTOOL_PATH: str = os.getenv(_ENV_PREFIX + "EXIFTOOL_PATH", "exiftool") 
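# Behavior of the env-flag helper reformatted above is unchanged by the
# reflow: input is stripped and upper-cased before matching.
assert _yes_or_no(" yes ") is True
assert _yes_or_no("TRUE") is True
assert _yes_or_no("no") is False
assert _yes_or_no("") is False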
IMAGE_DESCRIPTION_FILENAME = os.getenv( _ENV_PREFIX + "IMAGE_DESCRIPTION_FILENAME", "mapillary_image_description.json" ) SAMPLED_VIDEO_FRAMES_FILENAME = os.getenv( _ENV_PREFIX + "SAMPLED_VIDEO_FRAMES_FILENAME", "mapillary_sampled_video_frames" ) -USER_DATA_DIR = appdirs.user_data_dir(appname="mapillary_tools", appauthor="Mapillary") -# The chunk size in MB (see chunked transfer encoding https://en.wikipedia.org/wiki/Chunked_transfer_encoding) -# for uploading data to MLY upload service. -# Changing this size does not change the number of requests nor affect upload performance, -# but it affects the responsiveness of the upload progress bar -UPLOAD_CHUNK_SIZE_MB = float(os.getenv(_ENV_PREFIX + "UPLOAD_CHUNK_SIZE_MB", 1)) - # DoP value, the lower the better # See https://github.com/gopro/gpmf-parser#hero5-black-with-gps-enabled-adds # It is used to filter out noisy points @@ -54,11 +43,25 @@ def _yes_or_no(val: str) -> bool: GOPRO_GPS_FIXES: set[int] = set( int(fix) for fix in os.getenv(_ENV_PREFIX + "GOPRO_GPS_FIXES", "2,3").split(",") ) -MAX_UPLOAD_RETRIES: int = int(os.getenv(_ENV_PREFIX + "MAX_UPLOAD_RETRIES", 200)) - # GPS precision, in meters, is used to filter outliers GOPRO_GPS_PRECISION = float(os.getenv(_ENV_PREFIX + "GOPRO_GPS_PRECISION", 15)) +MAPILLARY__EXPERIMENTAL_ENABLE_IMU: bool = _yes_or_no( + os.getenv("MAPILLARY__EXPERIMENTAL_ENABLE_IMU", "NO") +) + +################################# +###### SEQUENCE PROCESSING ###### +################################# +# In meters +CUTOFF_DISTANCE = float(os.getenv(_ENV_PREFIX + "CUTOFF_DISTANCE", 600)) +# In seconds +CUTOFF_TIME = float(os.getenv(_ENV_PREFIX + "CUTOFF_TIME", 60)) +DUPLICATE_DISTANCE = float(os.getenv(_ENV_PREFIX + "DUPLICATE_DISTANCE", 0.1)) +DUPLICATE_ANGLE = float(os.getenv(_ENV_PREFIX + "DUPLICATE_ANGLE", 5)) +MAX_AVG_SPEED = float( + os.getenv(_ENV_PREFIX + "MAX_AVG_SPEED", 400_000 / 3600) +) # 400 KM/h # WARNING: Changing the following envvars might result in failed uploads # Max number of images per sequence MAX_SEQUENCE_LENGTH = int(os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_LENGTH", 1000)) @@ -67,27 +70,28 @@ def _yes_or_no(val: str) -> bool: # Max number of pixels per sequence (sum of image pixels in the sequence) MAX_SEQUENCE_PIXELS: str = os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_PIXELS", "6G") -PROMPT_DISABLED: bool = _yes_or_no(os.getenv(_ENV_PREFIX + "PROMPT_DISABLED", "NO")) - -_AUTH_VERIFICATION_DISABLED: bool = _yes_or_no( - os.getenv(_ENV_PREFIX + "_AUTH_VERIFICATION_DISABLED", "NO") -) +################## +##### UPLOAD ##### +################## MAPILLARY_DISABLE_API_LOGGING: bool = _yes_or_no( os.getenv("MAPILLARY_DISABLE_API_LOGGING", "NO") ) +MAPILLARY_UPLOAD_HISTORY_PATH: str = os.getenv( + "MAPILLARY_UPLOAD_HISTORY_PATH", os.path.join(USER_DATA_DIR, "upload_history") +) +MAX_IMAGE_UPLOAD_WORKERS: int = int( + os.getenv(_ENV_PREFIX + "MAX_IMAGE_UPLOAD_WORKERS", 64) +) +# The chunk size in MB (see chunked transfer encoding https://en.wikipedia.org/wiki/Chunked_transfer_encoding) +# for uploading data to MLY upload service. 
+# Changing this size does not change the number of requests nor affect upload performance, +# but it affects the responsiveness of the upload progress bar +UPLOAD_CHUNK_SIZE_MB: float = float(os.getenv(_ENV_PREFIX + "UPLOAD_CHUNK_SIZE_MB", 1)) +MAX_UPLOAD_RETRIES: int = int(os.getenv(_ENV_PREFIX + "MAX_UPLOAD_RETRIES", 200)) MAPILLARY__ENABLE_UPLOAD_HISTORY_FOR_DRY_RUN: bool = _yes_or_no( os.getenv("MAPILLARY__ENABLE_UPLOAD_HISTORY_FOR_DRY_RUN", "NO") ) -MAPILLARY__EXPERIMENTAL_ENABLE_IMU: bool = _yes_or_no( - os.getenv("MAPILLARY__EXPERIMENTAL_ENABLE_IMU", "NO") -) -MAPILLARY_UPLOAD_HISTORY_PATH: str = os.getenv( - "MAPILLARY_UPLOAD_HISTORY_PATH", - os.path.join( - USER_DATA_DIR, - "upload_history", - ), +_AUTH_VERIFICATION_DISABLED: bool = _yes_or_no( + os.getenv(_ENV_PREFIX + "_AUTH_VERIFICATION_DISABLED", "NO") ) - -MAX_IMAGE_UPLOAD_WORKERS = int(os.getenv(_ENV_PREFIX + "MAX_IMAGE_UPLOAD_WORKERS", 64)) From a4a8c4057949934f33fcd467fb77f7e90b687633 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 11:49:32 -0700 Subject: [PATCH 08/12] parse scaled values --- mapillary_tools/constants.py | 72 ++++++++++++++++++- .../process_sequence_properties.py | 68 +++--------------- 2 files changed, 80 insertions(+), 60 deletions(-) diff --git a/mapillary_tools/constants.py b/mapillary_tools/constants.py index 858f14d00..ab6ce003d 100644 --- a/mapillary_tools/constants.py +++ b/mapillary_tools/constants.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import os import appdirs @@ -11,6 +12,65 @@ def _yes_or_no(val: str) -> bool: return val.strip().upper() in ["1", "TRUE", "YES"] +def _parse_scaled_integers( + value: str, scale: dict[str, int] | None = None +) -> int | None: + """ + >>> scale = {"": 1, "b": 1, "K": 1024, "M": 1024 * 1024, "G": 1024 * 1024 * 1024} + >>> _parse_scaled_integers("0", scale=scale) + 0 + >>> _parse_scaled_integers("10", scale=scale) + 10 + >>> _parse_scaled_integers("100B", scale=scale) + 100 + >>> _parse_scaled_integers("100k", scale=scale) + 102400 + >>> _parse_scaled_integers("100t", scale=scale) + Traceback (most recent call last): + ValueError: Expect valid integer ends with , b, K, M, G, but got 100T + """ + + if scale is None: + scale = {"": 1} + + value = value.strip().upper() + + if value in ["INF", "INFINITY"]: + return None + + try: + for k, v in scale.items(): + k = k.upper() + if k and value.endswith(k): + return int(value[: -len(k)]) * v + + if "" in scale: + return int(value) * scale[""] + except ValueError: + pass + + raise ValueError( + f"Expect valid integer ends with {', '.join(scale.keys())}, but got {value}" + ) + + +_parse_pixels = functools.partial( + _parse_scaled_integers, + scale={ + "": 1, + "K": 1000, + "M": 1000 * 1000, + "MP": 1000 * 1000, + "G": 1000 * 1000 * 1000, + "GP": 1000 * 1000 * 1000, + }, +) + +_parse_filesize = functools.partial( + _parse_scaled_integers, + scale={"B": 1, "K": 1024, "M": 1024 * 1024, "G": 1024 * 1024 * 1024}, +) + ################### ##### GENERAL ##### ################### @@ -64,11 +124,17 @@ def _yes_or_no(val: str) -> bool: ) # 400 KM/h # WARNING: Changing the following envvars might result in failed uploads # Max number of images per sequence -MAX_SEQUENCE_LENGTH = int(os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_LENGTH", 1000)) +MAX_SEQUENCE_LENGTH: int | None = _parse_scaled_integers( + os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_LENGTH", "1000") +) # Max file size per sequence (sum of image filesizes in the sequence) -MAX_SEQUENCE_FILESIZE: str = os.getenv(_ENV_PREFIX + 
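# Note that the two scale tables bound above deliberately differ: pixel
# suffixes are decimal (K = 1000) while filesize suffixes are binary
# (K = 1024), matching the 6G-pixel and 110G-byte defaults:
assert _parse_pixels("6G") == 6 * 1000**3        # 6 gigapixels
assert _parse_filesize("110G") == 110 * 1024**3  # 110 GiB
assert _parse_filesize("inf") is None            # no limit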
"MAX_SEQUENCE_FILESIZE", "110G") +MAX_SEQUENCE_FILESIZE: int | None = _parse_filesize( + os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_FILESIZE", "110G") +) # Max number of pixels per sequence (sum of image pixels in the sequence) -MAX_SEQUENCE_PIXELS: str = os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_PIXELS", "6G") +MAX_SEQUENCE_PIXELS: int | None = _parse_pixels( + os.getenv(_ENV_PREFIX + "MAX_SEQUENCE_PIXELS", "6G") +) ################## diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 97073164f..984381c57 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -182,50 +182,6 @@ def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: ) -def _parse_filesize_in_bytes(filesize_str: str) -> int | None: - filesize_str = filesize_str.strip().upper() - - if filesize_str in ["INF", "INFINITY"]: - return None - - try: - if filesize_str.endswith("B"): - return int(filesize_str[:-1]) - elif filesize_str.endswith("K"): - return int(filesize_str[:-1]) * 1024 - elif filesize_str.endswith("M"): - return int(filesize_str[:-1]) * 1024 * 1024 - elif filesize_str.endswith("G"): - return int(filesize_str[:-1]) * 1024 * 1024 * 1024 - else: - return int(filesize_str) - except ValueError: - raise exceptions.MapillaryBadParameterError( - f"Expect valid file size that ends with B, K, M, or G, but got {filesize_str}" - ) - - -def _parse_pixels(pixels_str: str) -> int | None: - pixels_str = pixels_str.strip().upper() - - if pixels_str in ["INF", "INFINITY"]: - return None - - try: - if pixels_str.endswith("K"): - return int(pixels_str[:-1]) * 1000 - elif pixels_str.endswith("M"): - return int(pixels_str[:-1]) * 1000 * 1000 - elif pixels_str.endswith("G"): - return int(pixels_str[:-1]) * 1000 * 1000 * 1000 - else: - return int(pixels_str) - except ValueError: - raise exceptions.MapillaryBadParameterError( - f"Expect valid number of pixels that ends with K, M, or G, but got {pixels_str}" - ) - - def _avg_speed(sequence: T.Sequence[geo.PointLike]) -> float: total_distance = 0.0 for cur, nxt in geo.pairwise(sequence): @@ -379,7 +335,7 @@ def _check_sequences_by_limits( if output_errors: LOG.info( - "Found %s sequences and %s errors after sequence limit checks", + "Found %s image sequences and %s errors after sequence limit checks", len(output_sequences), len(output_errors), ) @@ -401,11 +357,11 @@ def _group_by_folder_and_camera( ), ) for key in grouped: - LOG.debug("Group sequences by %s: %s images", key, len(grouped[key])) + LOG.debug("Group image sequences by %s: %s images", key, len(grouped[key])) output_sequences = list(grouped.values()) LOG.info( - "Found %s sequences from different folders and cameras", + "Found %s image sequences from different folders and cameras", len(output_sequences), ) @@ -436,7 +392,7 @@ def _check_sequences_duplication( if output_errors: LOG.info( - "Found %s sequences and %s errors after duplication check", + "Found %s image sequences and %s errors after duplication check", len(output_sequences), len(output_errors), ) @@ -461,7 +417,7 @@ def _should_split_by_max_sequence_images( new_sequence_images = state.get("sequence_images", 0) + 1 split = max_sequence_images < new_sequence_images if split: - LOG.debug(f"Split because {new_sequence_images=} < {max_sequence_images=}") + LOG.info(f"Split because {new_sequence_images=} < {max_sequence_images=}") if split: new_sequence_images = 1 @@ -483,7 +439,7 @@ def _should_split_by_cutoff_time( diff = image.time - 
last_image.time split = cutoff_time < diff if split: - LOG.debug(f"Split because {cutoff_time=} < {diff=}") + LOG.info(f"Split because {cutoff_time=:.3f} < {diff=:.3f}") state["image"] = image @@ -504,7 +460,7 @@ def _should_split_by_cutoff_distance( ) split = cutoff_distance < diff if split: - LOG.debug(f"Split because {cutoff_distance=} < {diff=}") + LOG.info(f"Split because {cutoff_distance=:.3f} < {diff=:.3f}") state["image"] = image @@ -631,7 +587,7 @@ def _should_split_agg( if len(input_sequences) != len(output_sequences): LOG.info( - f"Split {len(input_sequences)} into {len(output_sequences)} sequences by limits" + f"Split {len(input_sequences)} into {len(output_sequences)} image sequences by limits" ) return output_sequences @@ -646,10 +602,8 @@ def process_sequence_properties( duplicate_angle: float = constants.DUPLICATE_ANGLE, max_avg_speed: float = constants.MAX_AVG_SPEED, ) -> list[types.MetadataOrError]: - max_sequence_filesize_in_bytes = _parse_filesize_in_bytes( - constants.MAX_SEQUENCE_FILESIZE - ) - max_sequence_pixels = _parse_pixels(constants.MAX_SEQUENCE_PIXELS) + max_sequence_filesize_in_bytes = constants.MAX_SEQUENCE_FILESIZE + max_sequence_pixels = constants.MAX_SEQUENCE_PIXELS error_metadatas: list[types.ErrorMetadata] = [] image_metadatas: list[types.ImageMetadata] = [] @@ -749,7 +703,7 @@ def process_sequence_properties( results = error_metadatas + image_metadatas + video_metadatas assert len(metadatas) == len(results), ( - f"expected {len(metadatas)} results but got {len(results)}" + f"Expected {len(metadatas)} results but got {len(results)}" ) return results From f2c8d0a6e803063bfd720013df67ab093cbd5dc8 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 14:23:34 -0700 Subject: [PATCH 09/12] logging --- mapillary_tools/geo.py | 16 ++++ .../process_sequence_properties.py | 90 +++++++------------ 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/mapillary_tools/geo.py b/mapillary_tools/geo.py index 5a6bb8d12..1e5fa3893 100644 --- a/mapillary_tools/geo.py +++ b/mapillary_tools/geo.py @@ -51,6 +51,22 @@ def gps_distance(latlon_1: tuple[float, float], latlon_2: tuple[float, float]) - return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2 + (z1 - z2) ** 2) +def avg_speed(sequence: T.Sequence[PointLike]) -> float: + total_distance = 0.0 + for cur, nxt in pairwise(sequence): + total_distance += gps_distance((cur.lat, cur.lon), (nxt.lat, nxt.lon)) + + if sequence: + time_diff = sequence[-1].time - sequence[0].time + else: + time_diff = 0.0 + + if time_diff == 0.0: + return float("inf") + + return total_distance / time_diff + + def compute_bearing( latlon_1: tuple[float, float], latlon_2: tuple[float, float], diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 984381c57..396759691 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -100,10 +100,7 @@ def duplication_check( for cur in it: # invariant: prev is processed - distance = geo.gps_distance( - (prev.lat, prev.lon), - (cur.lat, cur.lon), - ) + distance = geo.gps_distance((prev.lat, prev.lon), (cur.lat, cur.lon)) if prev.angle is not None and cur.angle is not None: angle_diff = geo.diff_bearing(prev.angle, cur.angle) @@ -182,25 +179,6 @@ def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: ) -def _avg_speed(sequence: T.Sequence[geo.PointLike]) -> float: - total_distance = 0.0 - for cur, nxt in geo.pairwise(sequence): - total_distance += geo.gps_distance( - 
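# Edge behavior of the relocated geo.avg_speed (as defined in geo.py
# above): zero elapsed time -- an empty list, a single point, or
# identical timestamps -- yields float("inf") instead of raising
# ZeroDivisionError; callers filter sequences shorter than two points
# before the speed check anyway. Point construction mirrors the tests:
assert geo.avg_speed([]) == float("inf")
assert geo.avg_speed([geo.Point(1, 1, 1, 1, angle=None)]) == float("inf")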
(cur.lat, cur.lon), - (nxt.lat, nxt.lon), - ) - - if sequence: - time_diff = sequence[-1].time - sequence[0].time - else: - time_diff = 0.0 - - if time_diff == 0.0: - return float("inf") - - return total_distance / time_diff - - def _is_video_stationary( sequence: T.Sequence[geo.PointLike], max_radius_in_meters: float ) -> bool: @@ -250,12 +228,12 @@ def _check_video_limits( ) if contains_null_island: raise exceptions.MapillaryNullIslandError( - "Found GPS coordinates in Null Island (0, 0)", + "GPS coordinates in Null Island (0, 0)" ) too_fast = ( len(video_metadata.points) >= 2 - and _avg_speed(video_metadata.points) > max_avg_speed + and geo.avg_speed(video_metadata.points) > max_avg_speed ) if too_fast: raise exceptions.MapillaryCaptureSpeedTooFastError( @@ -274,9 +252,7 @@ def _check_video_limits( if error_metadatas: LOG.info( - "Found %s videos and %s errors after video limit checks", - len(output_video_metadatas), - len(error_metadatas), + f"Video validation: {len(output_video_metadatas)} valid, {len(error_metadatas)} errors" ) return output_video_metadatas, error_metadatas @@ -291,15 +267,14 @@ def _check_sequences_by_limits( output_errors: list[types.ErrorMetadata] = [] for sequence in input_sequences: - sequence_filesize = sum( - utils.get_file_size(image.filename) - if image.filesize is None - else image.filesize - for image in sequence - ) - try: if max_sequence_filesize_in_bytes is not None: + sequence_filesize = sum( + utils.get_file_size(image.filename) + if image.filesize is None + else image.filesize + for image in sequence + ) if sequence_filesize > max_sequence_filesize_in_bytes: raise exceptions.MapillaryFileTooLargeError( f"Sequence file size exceeds the maximum allowed file size ({max_sequence_filesize_in_bytes} bytes)", @@ -310,10 +285,10 @@ def _check_sequences_by_limits( ) if contains_null_island: raise exceptions.MapillaryNullIslandError( - "Found GPS coordinates in Null Island (0, 0)", + "GPS coordinates in Null Island (0, 0)" ) - too_fast = len(sequence) >= 2 and _avg_speed(sequence) > max_avg_speed + too_fast = len(sequence) >= 2 and geo.avg_speed(sequence) > max_avg_speed if too_fast: raise exceptions.MapillaryCaptureSpeedTooFastError( f"Capture speed too fast (exceeds {round(max_avg_speed, 3)} m/s)", @@ -335,9 +310,7 @@ def _check_sequences_by_limits( if output_errors: LOG.info( - "Found %s image sequences and %s errors after sequence limit checks", - len(output_sequences), - len(output_errors), + f"Sequence validation: {output_sequences} valid, {len(output_errors)} errors" ) return output_sequences, output_errors @@ -357,13 +330,10 @@ def _group_by_folder_and_camera( ), ) for key in grouped: - LOG.debug("Group image sequences by %s: %s images", key, len(grouped[key])) + LOG.debug(f"Grouped {len(grouped[key])} images by {key}") output_sequences = list(grouped.values()) - LOG.info( - "Found %s image sequences from different folders and cameras", - len(output_sequences), - ) + LOG.info(f"Created {len(output_sequences)} sequences by folders and cameras") return output_sequences @@ -392,9 +362,7 @@ def _check_sequences_duplication( if output_errors: LOG.info( - "Found %s image sequences and %s errors after duplication check", - len(output_sequences), - len(output_errors), + f"Duplication check: {len(output_sequences)} sequences with {len(output_errors)} image duplicates removed" ) return output_sequences, output_errors @@ -409,7 +377,7 @@ class SplitState(T.TypedDict, total=False): def _should_split_by_max_sequence_images( state: SplitState, - _: 
types.ImageMetadata, + image: types.ImageMetadata, max_sequence_images: int, split: bool = False, ) -> tuple[SplitState, bool]: @@ -417,7 +385,9 @@ def _should_split_by_max_sequence_images( new_sequence_images = state.get("sequence_images", 0) + 1 split = max_sequence_images < new_sequence_images if split: - LOG.info(f"Split because {new_sequence_images=} < {max_sequence_images=}") + LOG.info( + f"Split sequence at {image.filename.name}: too many images ({new_sequence_images} > {max_sequence_images})" + ) if split: new_sequence_images = 1 @@ -439,7 +409,9 @@ def _should_split_by_cutoff_time( diff = image.time - last_image.time split = cutoff_time < diff if split: - LOG.info(f"Split because {cutoff_time=:.3f} < {diff=:.3f}") + LOG.info( + f"Split sequence at {image.filename.name}: time gap too large ({diff:.6g} seconds > {cutoff_time:.6g} seconds)" + ) state["image"] = image @@ -460,7 +432,9 @@ def _should_split_by_cutoff_distance( ) split = cutoff_distance < diff if split: - LOG.info(f"Split because {cutoff_distance=:.3f} < {diff=:.3f}") + LOG.info( + f"Split sequence at {image.filename.name}: distance gap too large ({diff:.6g} meters > {cutoff_distance:.6g} meters)" + ) state["image"] = image @@ -482,8 +456,8 @@ def _should_split_by_max_sequence_filesize( new_sequence_file_size = state.get("sequence_file_size", 0) + filesize split = max_sequence_filesize_in_bytes < new_sequence_file_size if split: - LOG.debug( - f"Split because {max_sequence_filesize_in_bytes=} < {new_sequence_file_size=}" + LOG.info( + f"Split sequence at {image.filename.name}: filesize too large ({new_sequence_file_size} > {max_sequence_filesize_in_bytes})" ) if split: @@ -509,7 +483,9 @@ def _should_split_by_max_sequence_pixels( new_sequence_pixels = state.get("sequence_pixels", 0) + pixels split = max_sequence_pixels < new_sequence_pixels if split: - LOG.debug(f"Split because {max_sequence_pixels=} < {new_sequence_pixels=}") + LOG.info( + f"Split sequence at {image.filename.name}: pixels too large ({max_sequence_pixels} < {new_sequence_pixels})" + ) if split: new_sequence_pixels = pixels @@ -586,9 +562,7 @@ def _should_split_agg( assert sum(len(s) for s in output_sequences) == sum(len(s) for s in input_sequences) if len(input_sequences) != len(output_sequences): - LOG.info( - f"Split {len(input_sequences)} into {len(output_sequences)} image sequences by limits" - ) + LOG.info(f"Split sequences: {len(input_sequences)} -> {len(output_sequences)}") return output_sequences From 3bc13a3bb4331e44353e0aec6cceee7d67b0c23c Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 14:33:17 -0700 Subject: [PATCH 10/12] logging --- .../process_sequence_properties.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 396759691..0734e2d63 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -111,15 +111,14 @@ def duplication_check( angle_diff is None or angle_diff <= max_duplicate_angle ): msg = f"Duplicate of its previous image in terms of distance <= {max_duplicate_distance} and angle <= {max_duplicate_angle}" + ex = exceptions.MapillaryDuplicationError( + msg, + DescriptionJSONSerializer.as_desc(cur), + distance=distance, + angle_diff=angle_diff, + ) dup = types.describe_error_metadata( - exceptions.MapillaryDuplicationError( - msg, - DescriptionJSONSerializer.as_desc(cur), - distance=distance, - angle_diff=angle_diff, - ), - 
cur.filename, - filetype=types.FileType.IMAGE, + ex, cur.filename, filetype=types.FileType.IMAGE ) dups.append(dup) # prev does not change @@ -353,7 +352,8 @@ def _check_sequences_duplication( max_duplicate_angle=duplicate_angle, ) assert len(sequence) == len(output_sequence) + len(errors) - output_sequences.append(output_sequence) + if output_sequence: + output_sequences.append(output_sequence) output_errors.extend(errors) assert sum(len(s) for s in output_sequences) + len(output_errors) == sum( @@ -362,7 +362,7 @@ def _check_sequences_duplication( if output_errors: LOG.info( - f"Duplication check: {len(output_sequences)} sequences with {len(output_errors)} image duplicates removed" + f"Duplication check: {len(output_errors)} image duplicates removed (with {duplicate_distance=} and {duplicate_angle=})" ) return output_sequences, output_errors From f2de99fc860618e17a0f80db6f444c4a1734bf42 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 14:51:57 -0700 Subject: [PATCH 11/12] add doctests --- .../process_sequence_properties.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 0734e2d63..66db8581c 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -88,11 +88,17 @@ def split_sequence_by( def duplication_check( sequence: PointSequence, max_duplicate_distance: float, max_duplicate_angle: float ) -> tuple[PointSequence, list[types.ErrorMetadata]]: + """ + >>> duplication_check([], 1, 2) + ([], []) + """ + dedups: PointSequence = [] dups: list[types.ErrorMetadata] = [] it = iter(sequence) - prev = next(it) + prev = next(it, None) + if prev is None: return dedups, dups @@ -144,9 +150,14 @@ def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: """ Update the timestamps make sure they are unique and sorted in the same order by interpolating subseconds + Examples: - - Input: 1, 1, 1, 1, 1, 2 - - Output: 1, 1.2, 1.4, 1.6, 1.8, 2 + >>> def make_point(t): + ... 
return geo.Point(lat=0, lon=0, time=t, alt=None, angle=None) + >>> points = [make_point(t) for t in [1, 1, 1, 1, 1, 2]] + >>> _interpolate_subsecs_for_sorting(points) + >>> [p.time for p in points] + [1.0, 1.2, 1.4, 1.6, 1.8, 2] """ gidx = 0 @@ -309,7 +320,7 @@ def _check_sequences_by_limits( if output_errors: LOG.info( - f"Sequence validation: {output_sequences} valid, {len(output_errors)} errors" + f"Sequence validation: {len(output_sequences)} valid, {len(output_errors)} errors" ) return output_sequences, output_errors @@ -356,6 +367,7 @@ def _check_sequences_duplication( output_sequences.append(output_sequence) output_errors.extend(errors) + # All input images should be accounted for either in output sequences or errors assert sum(len(s) for s in output_sequences) + len(output_errors) == sum( len(s) for s in input_sequences ) @@ -484,7 +496,7 @@ def _should_split_by_max_sequence_pixels( split = max_sequence_pixels < new_sequence_pixels if split: LOG.info( - f"Split sequence at {image.filename.name}: pixels too large ({max_sequence_pixels} < {new_sequence_pixels})" + f"Split sequence at {image.filename.name}: pixels too large ({new_sequence_pixels} > {max_sequence_pixels})" ) if split: From 1ba3fa78b9b91c6711bfa7db3f93e0c6f5b8df60 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Thu, 3 Jul 2025 15:02:25 -0700 Subject: [PATCH 12/12] doctests --- .../process_sequence_properties.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mapillary_tools/process_sequence_properties.py b/mapillary_tools/process_sequence_properties.py index 66db8581c..c3d7b1fe7 100644 --- a/mapillary_tools/process_sequence_properties.py +++ b/mapillary_tools/process_sequence_properties.py @@ -86,10 +86,13 @@ def split_sequence_by( def duplication_check( - sequence: PointSequence, max_duplicate_distance: float, max_duplicate_angle: float + sequence: PointSequence, + *, + max_duplicate_distance: float, + max_duplicate_angle: float, ) -> tuple[PointSequence, list[types.ErrorMetadata]]: """ - >>> duplication_check([], 1, 2) + >>> duplication_check([], max_duplicate_distance=1, max_duplicate_angle=2) ([], []) """ @@ -136,9 +139,9 @@ def duplication_check( return dedups, dups -def _group_by( +def _group_images_by( image_metadatas: T.Iterable[types.ImageMetadata], - group_key_func=T.Callable[[types.ImageMetadata], T.Hashable], + group_key_func: T.Callable[[types.ImageMetadata], T.Hashable], ) -> dict[T.Hashable, list[types.ImageMetadata]]: grouped: dict[T.Hashable, list[types.ImageMetadata]] = {} for metadata in image_metadatas: @@ -148,7 +151,7 @@ def _group_by( def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: """ - Update the timestamps make sure they are unique and sorted + Update the timestamps to make sure they are unique and sorted in the same order by interpolating subseconds Examples: @@ -158,6 +161,11 @@ def _interpolate_subsecs_for_sorting(sequence: PointSequence) -> None: >>> _interpolate_subsecs_for_sorting(points) >>> [p.time for p in points] [1.0, 1.2, 1.4, 1.6, 1.8, 2] + + >>> points = [make_point(t) for t in [1.1]] + >>> _interpolate_subsecs_for_sorting(points) + >>> [p.time for p in points] + [1.1] """ gidx = 0 @@ -329,7 +337,7 @@ def _check_sequences_by_limits( def _group_by_folder_and_camera( image_metadatas: list[types.ImageMetadata], ) -> list[list[types.ImageMetadata]]: - grouped = _group_by( + grouped = _group_images_by( image_metadatas, lambda metadata: ( str(metadata.filename.parent), @@ -486,7 +494,7 @@ def 
_should_split_by_max_sequence_pixels( max_sequence_pixels: int, split: bool = False, ) -> tuple[SplitState, bool]: - # Decent default values if width/height not available + # Default values if width/height not available width = 1024 if image.width is None else image.width height = 1024 if image.height is None else image.height pixels = width * height @@ -665,7 +673,7 @@ def process_sequence_properties( error_metadatas.extend(errors) # Split sequences by cutoff distance - # NOTE: The speed limit check probably rejects most of anomalies + # NOTE: The speed limit check probably rejects most anomalies sequences = _split_sequences_by_limits( sequences, cutoff_distance=cutoff_distance )
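
Notes on the series (reviewer sketches; none of the code below is part of
the patches above):

The _should_split_by_* helpers in PATCH 09 all share one reducer protocol:
take the running split state and the next image, and return the new state
plus whether to start a new sequence. A minimal self-contained sketch of
that protocol over plain integers (the reducer and the sample data here are
hypothetical, for illustration only):

    def should_split_by_gap(state: dict, value: int, max_gap: int) -> tuple[dict, bool]:
        # Split whenever the gap to the previous value exceeds max_gap,
        # mirroring how _should_split_by_cutoff_time compares capture times.
        last = state.get("last")
        split = last is not None and max_gap < value - last
        state["last"] = value
        return state, split

    state: dict = {}
    for value in [1, 2, 5, 6, 9]:
        state, split = should_split_by_gap(state, value, max_gap=1)
        print(value, split)  # a split is requested at 5 and at 9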
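
The avg_speed helper moved into geo.py by PATCH 09 divides the summed
point-to-point distances by the time elapsed between the first and last
point, returning infinity when no time elapses so that the too_fast checks
always reject such sequences. A rough standalone sketch, with a crude
flat-earth distance standing in for geo.gps_distance and a hypothetical
P class standing in for geo.PointLike:

    import itertools
    import math
    from dataclasses import dataclass

    @dataclass
    class P:  # hypothetical stand-in for geo.PointLike
        lat: float
        lon: float
        time: float

    def sketch_avg_speed(points: list[P]) -> float:
        # One degree is roughly 111,320 meters near the equator.
        total = sum(
            math.hypot(a.lat - b.lat, a.lon - b.lon) * 111_320
            for a, b in itertools.pairwise(points)
        )
        elapsed = points[-1].time - points[0].time if points else 0.0
        return float("inf") if elapsed == 0.0 else total / elapsed

    # Two points 0.001 degrees (~111 m) apart, captured 10 seconds apart:
    print(sketch_avg_speed([P(0.0, 0.0, 0.0), P(0.001, 0.0, 10.0)]))  # ~11.1 m/s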
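
PATCH 10 and PATCH 12 reshape duplication_check without changing its rule:
an image is dropped as a duplicate only when it is both close to and
aligned with its predecessor, and a missing angle never vetoes the
deduplication. The predicate, isolated as a sketch (the threshold values
below are illustrative, not the library defaults):

    def is_duplicate(
        distance: float,
        angle_diff: float | None,
        max_duplicate_distance: float,
        max_duplicate_angle: float,
    ) -> bool:
        return distance <= max_duplicate_distance and (
            angle_diff is None or angle_diff <= max_duplicate_angle
        )

    assert is_duplicate(0.5, 3.0, max_duplicate_distance=1, max_duplicate_angle=5)
    assert not is_duplicate(2.0, 3.0, max_duplicate_distance=1, max_duplicate_angle=5)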
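
The doctests added in PATCH 11 pin down the subsecond interpolation: a run
of N identical timestamps t followed by a distinct timestamp t_next is
spread evenly across [t, t_next). The arithmetic behind the expected
doctest output, as a sketch (the actual implementation is not shown in
these hunks):

    def spread(run_start: float, run_len: int, next_time: float) -> list[float]:
        step_total = next_time - run_start
        return [run_start + step_total * i / run_len for i in range(run_len)]

    # Matches the doctest [1.0, 1.2, 1.4, 1.6, 1.8, 2], up to float rounding:
    print(spread(1.0, 5, 2.0))  # [1.0, 1.2, 1.4, 1.6, 1.8]; the final 2 is unchanged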