63 changes: 42 additions & 21 deletions doctr/models/_utils.py
@@ -52,13 +52,16 @@ def estimate_orientation(
Returns:
the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
"""
assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
thresh = None
assert len(img.shape) == 3 and img.shape[-1] in [1, 3], (
f"Image shape {img.shape} not supported"
)
# Convert image to grayscale if necessary
if img.shape[-1] == 3:
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray_img = cv2.medianBlur(gray_img, 5)
thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
thresh = cv2.threshold(
gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
)[1]
else:
thresh = img.astype(np.uint8)

@@ -67,33 +70,40 @@ def estimate_orientation(
# We rotate the image to the general orientation, which improves the detection
# No expand needed, the bitmap is already padded
thresh = rotate_image(thresh, -page_orientation)
else: # That's only required if we do not work on the detection models bin map
else:
# try to merge words in lines
(h, w) = img.shape[:2]
k_x = max(1, (floor(w / 100)))
k_y = max(1, (floor(h / 100)))
h, w = img.shape[:2]
k_x = max(1, floor(w / 100))
k_y = max(1, floor(h / 100))
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
thresh = cv2.dilate(thresh, kernel, iterations=1)

# extract contours
contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# Filter & Sort contours
contours = sorted(
[contour for contour in contours if cv2.contourArea(contour) > lower_area],
key=get_max_width_length_ratio,
reverse=True,
)
# Filter & Sort contours in one pass
filtered_contours = [
contour for contour in contours if cv2.contourArea(contour) > lower_area
]
if filtered_contours:
contours_sorted = sorted(
filtered_contours, key=get_max_width_length_ratio, reverse=True
)
else:
contours_sorted = []

angles = []
for contour in contours[:n_ct]:
for contour in contours_sorted[:n_ct]:
_, (w, h), angle = cv2.minAreaRect(contour)
if w / h > ratio_threshold_for_lines: # select only contours with ratio like lines
if h == 0:
continue # avoid division by zero
ratio = w / h
if ratio > ratio_threshold_for_lines:
angles.append(angle)
elif w / h < 1 / ratio_threshold_for_lines: # if lines are vertical, subtract 90 degrees
elif ratio < 1 / ratio_threshold_for_lines:
angles.append(angle - 90)

if len(angles) == 0:
if not angles:
estimated_angle = 0  # in case no angle is found
else:
median = -median_low(angles)
@@ -107,7 +117,11 @@ def estimate_orientation(
# so in this case we prefer the general page orientation
if abs(estimated_angle) == abs(page_orientation):
return page_orientation
estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
estimated_angle = (
estimated_angle
if page_orientation == 0
else page_orientation + estimated_angle
)
if estimated_angle > 180:
estimated_angle -= 360
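
For reference, a minimal usage sketch of estimate_orientation (an editor's addition, not part of this diff; the image path is a placeholder):

import cv2
from doctr.models._utils import estimate_orientation

# Load a scanned page as a BGR uint8 array (placeholder path)
page = cv2.imread("scanned_page.jpg")
# Skew in degrees: negative for a left rotation, positive for a right one
angle = estimate_orientation(page)
print(f"Estimated skew: {angle} degrees")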

@@ -127,7 +141,10 @@ def rectify_crops(
# Inverse predictions (if angle of +90 is detected, rotate by -90)
orientations = [4 - pred if pred != 0 else 0 for pred in orientations]
return (
[crop if orientation == 0 else np.rot90(crop, orientation) for orientation, crop in zip(orientations, crops)]
[
crop if orientation == 0 else np.rot90(crop, orientation)
for orientation, crop in zip(orientations, crops)
]
if len(orientations) > 0
else []
)
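
Illustration only (editor's sketch): the inversion above maps a predicted quarter-turn class to the counter-rotation passed to np.rot90.

import numpy as np

crops = [np.zeros((32, 64, 3), dtype=np.uint8)]  # dummy crop, H=32, W=64
orientations = [1]  # classifier says: rotated by one quarter-turn
inv = [4 - p if p != 0 else 0 for p in orientations]  # 1 -> 3 turns back
rectified = [np.rot90(c, o) if o != 0 else c for o, c in zip(inv, crops)]
print(rectified[0].shape)  # (64, 32, 3): the crop is upright again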
@@ -184,9 +201,13 @@ def invert_data_structure(
a dictionary of lists when x is a list of dictionaries, or a list of dictionaries when x is a dictionary of lists
"""
if isinstance(x, dict):
assert len({len(v) for v in x.values()}) == 1, "All the lists in the dictionary should have the same length."
assert len({len(v) for v in x.values()}) == 1, (
"All the lists in the dictionary should have the same length."
)
return [dict(zip(x, t)) for t in zip(*x.values())]
elif isinstance(x, list):
return {k: [dic[k] for dic in x] for k in x[0]}
else:
raise TypeError(f"Expected input to be either a dict or a list, got {type(input)} instead.")
raise TypeError(
f"Expected input to be either a dict or a list, got {type(input)} instead."
)
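
For reference (not part of this diff), invert_data_structure round-trips between the two layouts:

from doctr.models._utils import invert_data_structure

batch = {"boxes": [[0, 0, 1, 1], [1, 1, 2, 2]], "scores": [0.9, 0.8]}
per_item = invert_data_structure(batch)
# [{'boxes': [0, 0, 1, 1], 'scores': 0.9}, {'boxes': [1, 1, 2, 2], 'scores': 0.8}]
assert invert_data_structure(per_item) == batch  # round-trips back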
151 changes: 112 additions & 39 deletions doctr/utils/geometry.py
@@ -74,7 +74,9 @@ def _detach(boxes: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
return list(loc_preds), list(obj_scores)


def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
def resolve_enclosing_bbox(
bboxes: list[BoundingBox] | np.ndarray,
) -> BoundingBox | np.ndarray:
"""Compute enclosing bbox either from:

Args:
@@ -96,7 +98,9 @@ def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
return (min(x), min(y)), (max(x), max(y))
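
An editor's sketch of the BoundingBox branch shown above (not part of this diff):

from doctr.utils.geometry import resolve_enclosing_bbox

boxes = [((0.1, 0.2), (0.4, 0.5)), ((0.3, 0.1), (0.6, 0.6))]
print(resolve_enclosing_bbox(boxes))  # ((0.1, 0.1), (0.6, 0.6))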


def resolve_enclosing_rbbox(rbboxes: list[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
def resolve_enclosing_rbbox(
rbboxes: list[np.ndarray], intermed_size: int = 1024
) -> np.ndarray:
"""Compute enclosing rotated bbox either from:

Args:
@@ -130,7 +134,11 @@ def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
"""
angle_rad = angle * np.pi / 180.0 # compute radian angle for np functions
rotation_mat = np.array(
[[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=points.dtype
[
[np.cos(angle_rad), -np.sin(angle_rad)],
[np.sin(angle_rad), np.cos(angle_rad)],
],
dtype=points.dtype,
)
return np.matmul(points, rotation_mat.T)
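
A quick sanity check of the rotation matrix above (editor's addition): 90 degrees should send (10, 0) to (0, 10).

import numpy as np
from doctr.utils.geometry import rotate_abs_points

pts = np.array([[10.0, 0.0]])
print(rotate_abs_points(pts, angle=90.0))  # ~[[0., 10.]] up to float rounding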

@@ -145,10 +153,12 @@ def compute_expanded_shape(img_shape: tuple[int, int], angle: float) -> tuple[int, int]:
Returns:
the height and width of the rotated image
"""
points: np.ndarray = np.array([
[img_shape[1] / 2, img_shape[0] / 2],
[-img_shape[1] / 2, img_shape[0] / 2],
])
points: np.ndarray = np.array(
[
[img_shape[1] / 2, img_shape[0] / 2],
[-img_shape[1] / 2, img_shape[0] / 2],
]
)

rotated_points = rotate_abs_points(points, angle)
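
A quick check of the expanded shape (editor's addition; the function tail is truncated in this hunk, so treat the expected value as approximate): a 100 x 200 (H x W) image rotated by 90 degrees needs a roughly 200 x 100 canvas.

from doctr.utils.geometry import compute_expanded_shape

print(compute_expanded_shape((100, 200), 90.0))  # ~(200, 100)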

@@ -176,7 +186,10 @@ def rotate_abs_geoms(
"""
# Switch to polygons
polys = (
np.stack([geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]], axis=1)
np.stack(
[geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]],
axis=1,
)
if geoms.ndim == 2
else geoms
)
@@ -191,13 +204,19 @@
# Switch back to top-left corner as referential
target_shape = compute_expanded_shape(img_shape, angle) if expand else img_shape
# Clip coords to fit since there is no expansion
rotated_polys[..., 0] = (rotated_polys[..., 0] + target_shape[1] / 2).clip(0, target_shape[1])
rotated_polys[..., 1] = (target_shape[0] / 2 - rotated_polys[..., 1]).clip(0, target_shape[0])
rotated_polys[..., 0] = (rotated_polys[..., 0] + target_shape[1] / 2).clip(
0, target_shape[1]
)
rotated_polys[..., 1] = (target_shape[0] / 2 - rotated_polys[..., 1]).clip(
0, target_shape[0]
)

return rotated_polys


def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]) -> np.ndarray:
def remap_boxes(
loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]
) -> np.ndarray:
"""Remaps a batch of rotated locpred (N, 4, 2) expressed for an origin_shape to a destination_shape.
This does not impact the absolute shape of the boxes, but allows computing the new relative RotatedBbox
coordinates after a resize of the image.
@@ -213,12 +232,18 @@
if len(dest_shape) != 2:
raise ValueError(f"Mask length should be 2, was found at: {len(dest_shape)}")
if len(orig_shape) != 2:
raise ValueError(f"Image_shape length should be 2, was found at: {len(orig_shape)}")
raise ValueError(
f"Image_shape length should be 2, was found at: {len(orig_shape)}"
)
orig_height, orig_width = orig_shape
dest_height, dest_width = dest_shape
mboxes = loc_preds.copy()
mboxes[:, :, 0] = ((loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2) / dest_width
mboxes[:, :, 1] = ((loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2) / dest_height
mboxes[:, :, 0] = (
(loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2
) / dest_width
mboxes[:, :, 1] = (
(loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2
) / dest_height

return mboxes
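
Illustration only (editor's sketch): boxes stay fixed in absolute pixels while the canvas grows, so relative coordinates shrink toward the center.

import numpy as np
from doctr.utils.geometry import remap_boxes

# One quad covering the top-left quarter of a 100 x 100 image
box = np.array([[[0.0, 0.0], [0.5, 0.0], [0.5, 0.5], [0.0, 0.5]]])
# Padded symmetrically to 200 x 200: x = (0.0 * 100 + 50) / 200 = 0.25, etc.
print(remap_boxes(box, orig_shape=(100, 100), dest_shape=(200, 200)))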

@@ -263,19 +288,31 @@ def rotate_boxes(
# Compute rotation matrix
angle_rad = angle * np.pi / 180.0 # compute radian angle for np functions
rotation_mat = np.array(
[[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=_boxes.dtype
[
[np.cos(angle_rad), -np.sin(angle_rad)],
[np.sin(angle_rad), np.cos(angle_rad)],
],
dtype=_boxes.dtype,
)
# Rotate absolute points
points: np.ndarray = np.stack((_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1)
points: np.ndarray = np.stack(
(_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1
)
image_center = (orig_shape[1] / 2, orig_shape[0] / 2)
rotated_points = image_center + np.matmul(points - image_center, rotation_mat)
rotated_boxes: np.ndarray = np.stack(
(rotated_points[:, :, 0] / orig_shape[1], rotated_points[:, :, 1] / orig_shape[0]), axis=-1
(
rotated_points[:, :, 0] / orig_shape[1],
rotated_points[:, :, 1] / orig_shape[0],
),
axis=-1,
)

# Apply a mask if requested
if target_shape is not None:
rotated_boxes = remap_boxes(rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape)
rotated_boxes = remap_boxes(
rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape
)

return rotated_boxes
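
The core transform above as a standalone sketch (editor's addition): relative coords to absolute pixels, rotation about the image center, back to relative, using the same right-multiply convention as the diff.

import numpy as np

angle_rad = np.deg2rad(90.0)
rot = np.array([[np.cos(angle_rad), -np.sin(angle_rad)],
                [np.sin(angle_rad), np.cos(angle_rad)]])
h, w = 100, 200
pt = np.array([0.25, 0.5]) * [w, h]     # relative (x, y) -> absolute pixels
center = np.array([w / 2, h / 2])
rotated = center + (pt - center) @ rot  # rotate about the image center
print(rotated / [w, h])                 # back to relative: [0.5, 1.0]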

@@ -297,34 +334,59 @@ def rotate_image(
Returns:
Rotated array, padded by 0 by default.
"""
# Early exit for zero angle and no padding/resize requested
if angle == 0 and not expand and not preserve_origin_shape:
return image

# Compute the expanded padding
exp_img: np.ndarray
if expand:
exp_shape = compute_expanded_shape(image.shape[:2], angle)
h_pad, w_pad = (
int(max(0, ceil(exp_shape[0] - image.shape[0]))),
int(max(0, ceil(exp_shape[1] - image.shape[1]))),
exp_h, exp_w = exp_shape
h, w = image.shape[:2]

h_pad = int(max(0, ceil(exp_h - h)))
w_pad = int(max(0, ceil(exp_w - w)))
pad_spec = (
(h_pad // 2, h_pad - h_pad // 2),
(w_pad // 2, w_pad - w_pad // 2),
(0, 0),
)
exp_img = np.pad(image, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
exp_img = np.pad(image, pad_spec)
src_h, src_w = exp_img.shape[:2]
else:
exp_img = image
src_h, src_w = exp_img.shape[:2]

rot_mat = cv2.getRotationMatrix2D((src_w / 2, src_h / 2), angle, 1.0)
rot_img = cv2.warpAffine(exp_img, rot_mat, (src_w, src_h))

height, width = exp_img.shape[:2]
rot_mat = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1.0)
rot_img = cv2.warpAffine(exp_img, rot_mat, (width, height))
if expand:
# Pad to get the same aspect ratio
if (image.shape[0] / image.shape[1]) != (rot_img.shape[0] / rot_img.shape[1]):
# Pad to get the same aspect ratio if needed
orig_h, orig_w = image.shape[:2]
orig_ratio = orig_h / orig_w
rot_ratio = rot_img.shape[0] / rot_img.shape[1]
if orig_ratio != rot_ratio:
# Pad width
if (rot_img.shape[0] / rot_img.shape[1]) > (image.shape[0] / image.shape[1]):
h_pad, w_pad = 0, int(rot_img.shape[0] * image.shape[1] / image.shape[0] - rot_img.shape[1])
if rot_ratio > orig_ratio:
new_w = int(rot_img.shape[0] * orig_w / orig_h)
w_pad = new_w - rot_img.shape[1]
h_pad = 0
# Pad height
else:
h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
new_h = int(rot_img.shape[1] * orig_h / orig_w)
h_pad = new_h - rot_img.shape[0]
w_pad = 0
pad_spec = (
(h_pad // 2, h_pad - h_pad // 2),
(w_pad // 2, w_pad - w_pad // 2),
(0, 0),
)
rot_img = np.pad(rot_img, pad_spec)

if preserve_origin_shape:
# rescale
rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
rot_img = cv2.resize(
rot_img, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR
)

return rot_img
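
For reference (not part of this diff), the expand / preserve_origin_shape interplay:

import numpy as np
from doctr.utils.geometry import rotate_image

img = np.zeros((100, 200, 3), dtype=np.uint8)
out = rotate_image(img, angle=30.0, expand=True)
print(out.shape)  # larger canvas, padded to keep the input aspect ratio
same = rotate_image(img, angle=30.0, expand=True, preserve_origin_shape=True)
print(same.shape)  # (100, 200, 3): rescaled back to the original size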

@@ -359,13 +421,17 @@ def estimate_page_angle(polys: np.ndarray) -> float:
with np.errstate(divide="raise", invalid="raise"):
try:
return float(
np.median(np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi) # Y axis from top to bottom!
np.median(
np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi
) # Y axis from top to bottom!
)
except FloatingPointError:
return 0.0
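
A quick sanity check (editor's addition): a level line of text should give an angle of zero.

import numpy as np
from doctr.utils.geometry import estimate_page_angle

# One horizontal quad, (x, y) corners in relative coords, clockwise from top-left
poly = np.array([[[0.1, 0.4], [0.9, 0.4], [0.9, 0.6], [0.1, 0.6]]])
print(estimate_page_angle(poly))  # 0.0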


def convert_to_relative_coords(geoms: np.ndarray, img_shape: tuple[int, int]) -> np.ndarray:
def convert_to_relative_coords(
geoms: np.ndarray, img_shape: tuple[int, int]
) -> np.ndarray:
"""Convert a geometry to relative coordinates

Args:
@@ -404,7 +470,9 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
if boxes.shape[0] == 0:
return []
if boxes.shape[1] != 4:
raise AssertionError("boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)")
raise AssertionError(
"boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)"
)

# Project relative coordinates
_boxes = boxes.copy()
@@ -420,7 +488,10 @@


def extract_rcrops(
img: np.ndarray, polys: np.ndarray, dtype=np.float32, assume_horizontal: bool = False
img: np.ndarray,
polys: np.ndarray,
dtype=np.float32,
assume_horizontal: bool = False,
) -> list[np.ndarray]:
"""Created cropped images from list of rotated bounding boxes

@@ -436,7 +507,9 @@
if polys.shape[0] == 0:
return []
if polys.shape[1:] != (4, 2):
raise AssertionError("polys are expected to be quadrilateral, of shape (N, 4, 2)")
raise AssertionError(
"polys are expected to be quadrilateral, of shape (N, 4, 2)"
)

# Project relative coordinates
_boxes = polys.copy()