diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py
index 2443ab0eca..c04a962436 100644
--- a/doctr/models/_utils.py
+++ b/doctr/models/_utils.py
@@ -52,13 +52,16 @@ def estimate_orientation(
     Returns:
         the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
     """
-    assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
-    thresh = None
+    assert len(img.shape) == 3 and img.shape[-1] in [1, 3], (
+        f"Image shape {img.shape} not supported"
+    )
     # Convert image to grayscale if necessary
     if img.shape[-1] == 3:
         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         gray_img = cv2.medianBlur(gray_img, 5)
-        thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+        thresh = cv2.threshold(
+            gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+        )[1]
     else:
         thresh = img.astype(np.uint8)
 
@@ -67,33 +70,40 @@ def estimate_orientation(
         # We rotate the image to the general orientation which improves the detection
         # No expand needed bitmap is already padded
         thresh = rotate_image(thresh, -page_orientation)
-    else:  # That's only required if we do not work on the detection models bin map
+    else:
         # try to merge words in lines
-        (h, w) = img.shape[:2]
-        k_x = max(1, (floor(w / 100)))
-        k_y = max(1, (floor(h / 100)))
+        h, w = img.shape[:2]
+        k_x = max(1, floor(w / 100))
+        k_y = max(1, floor(h / 100))
         kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
         thresh = cv2.dilate(thresh, kernel, iterations=1)
 
     # extract contours
     contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    # Filter & Sort contours
-    contours = sorted(
-        [contour for contour in contours if cv2.contourArea(contour) > lower_area],
-        key=get_max_width_length_ratio,
-        reverse=True,
-    )
+    # Filter & Sort contours in one pass
+    filtered_contours = [
+        contour for contour in contours if cv2.contourArea(contour) > lower_area
+    ]
+    if filtered_contours:
+        contours_sorted = sorted(
+            filtered_contours, key=get_max_width_length_ratio, reverse=True
+        )
+    else:
+        contours_sorted = []
 
     angles = []
-    for contour in contours[:n_ct]:
+    for contour in contours_sorted[:n_ct]:
         _, (w, h), angle = cv2.minAreaRect(contour)
-        if w / h > ratio_threshold_for_lines:  # select only contours with ratio like lines
+        if h == 0:
+            continue  # avoid division by zero
+        ratio = w / h
+        if ratio > ratio_threshold_for_lines:
             angles.append(angle)
-        elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, substract 90 degree
+        elif ratio < 1 / ratio_threshold_for_lines:
             angles.append(angle - 90)
 
-    if len(angles) == 0:
+    if not angles:
         estimated_angle = 0  # in case no angles is found
     else:
         median = -median_low(angles)
@@ -107,7 +117,11 @@ def estimate_orientation(
         # so in this case we prefer the general page orientation
         if abs(estimated_angle) == abs(page_orientation):
             return page_orientation
-        estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
+        estimated_angle = (
+            estimated_angle
+            if page_orientation == 0
+            else page_orientation + estimated_angle
+        )
 
     if estimated_angle > 180:
         estimated_angle -= 360
@@ -127,7 +141,10 @@ def rectify_crops(
     # Inverse predictions (if angle of +90 is detected, rotate by -90)
     orientations = [4 - pred if pred != 0 else 0 for pred in orientations]
     return (
-        [crop if orientation == 0 else np.rot90(crop, orientation) for orientation, crop in zip(orientations, crops)]
+        [
+            crop if orientation == 0 else np.rot90(crop, orientation)
+            for orientation, crop in zip(orientations, crops)
+        ]
         if len(orientations) > 0
         else []
     )
@@ -184,9 +201,13 @@ def invert_data_structure(
         dictionary of list when x is a list of dictionaries or a list of dictionaries when x is dictionary of lists
     """
     if isinstance(x, dict):
-        assert len({len(v) for v in x.values()}) == 1, "All the lists in the dictionary should have the same length."
+        assert len({len(v) for v in x.values()}) == 1, (
+            "All the lists in the dictionary should have the same length."
+        )
         return [dict(zip(x, t)) for t in zip(*x.values())]
     elif isinstance(x, list):
         return {k: [dic[k] for dic in x] for k in x[0]}
     else:
-        raise TypeError(f"Expected input to be either a dict or a list, got {type(input)} instead.")
+        raise TypeError(
+            f"Expected input to be either a dict or a list, got {type(x)} instead."
+        )
diff --git a/doctr/utils/geometry.py b/doctr/utils/geometry.py
index a5548bcc61..2d0ff0d82b 100644
--- a/doctr/utils/geometry.py
+++ b/doctr/utils/geometry.py
@@ -74,7 +74,9 @@ def _detach(boxes: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
     return list(loc_preds), list(obj_scores)
 
 
-def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBox | np.ndarray:
+def resolve_enclosing_bbox(
+    bboxes: list[BoundingBox] | np.ndarray,
+) -> BoundingBox | np.ndarray:
     """Compute enclosing bbox either from:
 
     Args:
@@ -96,7 +98,9 @@ def resolve_enclosing_bbox(bboxes: list[BoundingBox] | np.ndarray) -> BoundingBo
     return (min(x), min(y)), (max(x), max(y))
 
 
-def resolve_enclosing_rbbox(rbboxes: list[np.ndarray], intermed_size: int = 1024) -> np.ndarray:
+def resolve_enclosing_rbbox(
+    rbboxes: list[np.ndarray], intermed_size: int = 1024
+) -> np.ndarray:
     """Compute enclosing rotated bbox either from:
 
     Args:
@@ -130,7 +134,11 @@ def rotate_abs_points(points: np.ndarray, angle: float = 0.0) -> np.ndarray:
     """
     angle_rad = angle * np.pi / 180.0  # compute radian angle for np functions
     rotation_mat = np.array(
-        [[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=points.dtype
+        [
+            [np.cos(angle_rad), -np.sin(angle_rad)],
+            [np.sin(angle_rad), np.cos(angle_rad)],
+        ],
+        dtype=points.dtype,
     )
     return np.matmul(points, rotation_mat.T)
 
@@ -145,10 +153,12 @@ def compute_expanded_shape(img_shape: tuple[int, int], angle: float) -> tuple[in
     Returns:
         the height and width of the rotated image
     """
-    points: np.ndarray = np.array([
-        [img_shape[1] / 2, img_shape[0] / 2],
-        [-img_shape[1] / 2, img_shape[0] / 2],
-    ])
+    points: np.ndarray = np.array(
+        [
+            [img_shape[1] / 2, img_shape[0] / 2],
+            [-img_shape[1] / 2, img_shape[0] / 2],
+        ]
+    )
 
     rotated_points = rotate_abs_points(points, angle)
 
@@ -176,7 +186,10 @@ def rotate_abs_geoms(
     """
     # Switch to polygons
     polys = (
-        np.stack([geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]], axis=1)
+        np.stack(
+            [geoms[:, [0, 1]], geoms[:, [2, 1]], geoms[:, [2, 3]], geoms[:, [0, 3]]],
+            axis=1,
+        )
         if geoms.ndim == 2
         else geoms
     )
@@ -191,13 +204,19 @@ def rotate_abs_geoms(
     # Switch back to top-left corner as referential
     target_shape = compute_expanded_shape(img_shape, angle) if expand else img_shape
     # Clip coords to fit since there is no expansion
-    rotated_polys[..., 0] = (rotated_polys[..., 0] + target_shape[1] / 2).clip(0, target_shape[1])
-    rotated_polys[..., 1] = (target_shape[0] / 2 - rotated_polys[..., 1]).clip(0, target_shape[0])
+    rotated_polys[..., 0] = (rotated_polys[..., 0] + target_shape[1] / 2).clip(
+        0, target_shape[1]
+    )
+    rotated_polys[..., 1] = (target_shape[0] / 2 - rotated_polys[..., 1]).clip(
+        0, target_shape[0]
+    )
 
     return rotated_polys
 
 
-def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]) -> np.ndarray:
+def remap_boxes(
+    loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape: tuple[int, int]
+) -> np.ndarray:
     """Remaps a batch of rotated locpred (N, 4, 2) expressed for an origin_shape to a destination_shape.
     This does not impact the absolute shape of the boxes, but allow to calculate the new relative RotatedBbox
     coordinates after a resizing of the image.
@@ -213,12 +232,18 @@ def remap_boxes(loc_preds: np.ndarray, orig_shape: tuple[int, int], dest_shape:
     if len(dest_shape) != 2:
         raise ValueError(f"Mask length should be 2, was found at: {len(dest_shape)}")
     if len(orig_shape) != 2:
-        raise ValueError(f"Image_shape length should be 2, was found at: {len(orig_shape)}")
+        raise ValueError(
+            f"Image_shape length should be 2, was found at: {len(orig_shape)}"
+        )
     orig_height, orig_width = orig_shape
     dest_height, dest_width = dest_shape
     mboxes = loc_preds.copy()
-    mboxes[:, :, 0] = ((loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2) / dest_width
-    mboxes[:, :, 1] = ((loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2) / dest_height
+    mboxes[:, :, 0] = (
+        (loc_preds[:, :, 0] * orig_width) + (dest_width - orig_width) / 2
+    ) / dest_width
+    mboxes[:, :, 1] = (
+        (loc_preds[:, :, 1] * orig_height) + (dest_height - orig_height) / 2
+    ) / dest_height
 
     return mboxes
 
@@ -263,19 +288,31 @@ def rotate_boxes(
     # Compute rotation matrix
     angle_rad = angle * np.pi / 180.0  # compute radian angle for np functions
     rotation_mat = np.array(
-        [[np.cos(angle_rad), -np.sin(angle_rad)], [np.sin(angle_rad), np.cos(angle_rad)]], dtype=_boxes.dtype
+        [
+            [np.cos(angle_rad), -np.sin(angle_rad)],
+            [np.sin(angle_rad), np.cos(angle_rad)],
+        ],
+        dtype=_boxes.dtype,
     )
     # Rotate absolute points
-    points: np.ndarray = np.stack((_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1)
+    points: np.ndarray = np.stack(
+        (_boxes[:, :, 0] * orig_shape[1], _boxes[:, :, 1] * orig_shape[0]), axis=-1
+    )
     image_center = (orig_shape[1] / 2, orig_shape[0] / 2)
     rotated_points = image_center + np.matmul(points - image_center, rotation_mat)
     rotated_boxes: np.ndarray = np.stack(
-        (rotated_points[:, :, 0] / orig_shape[1], rotated_points[:, :, 1] / orig_shape[0]), axis=-1
+        (
+            rotated_points[:, :, 0] / orig_shape[1],
+            rotated_points[:, :, 1] / orig_shape[0],
+        ),
+        axis=-1,
     )
     # Apply a mask if requested
     if target_shape is not None:
-        rotated_boxes = remap_boxes(rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape)
+        rotated_boxes = remap_boxes(
+            rotated_boxes, orig_shape=orig_shape, dest_shape=target_shape
+        )
 
     return rotated_boxes
 
@@ -297,34 +334,59 @@ def rotate_image(
     Returns:
         Rotated array, padded by 0 by default.
     """
+    # Early exit for zero angle and no padding/resize requested
+    if angle == 0 and not expand and not preserve_origin_shape:
+        return image
+
     # Compute the expanded padding
-    exp_img: np.ndarray
     if expand:
         exp_shape = compute_expanded_shape(image.shape[:2], angle)
-        h_pad, w_pad = (
-            int(max(0, ceil(exp_shape[0] - image.shape[0]))),
-            int(max(0, ceil(exp_shape[1] - image.shape[1]))),
+        exp_h, exp_w = exp_shape
+        h, w = image.shape[:2]
+
+        h_pad = int(max(0, ceil(exp_h - h)))
+        w_pad = int(max(0, ceil(exp_w - w)))
+        pad_spec = (
+            (h_pad // 2, h_pad - h_pad // 2),
+            (w_pad // 2, w_pad - w_pad // 2),
+            (0, 0),
         )
-        exp_img = np.pad(image, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
+        exp_img = np.pad(image, pad_spec)
+        src_h, src_w = exp_img.shape[:2]
     else:
         exp_img = image
+        src_h, src_w = exp_img.shape[:2]
+
+    rot_mat = cv2.getRotationMatrix2D((src_w / 2, src_h / 2), angle, 1.0)
+    rot_img = cv2.warpAffine(exp_img, rot_mat, (src_w, src_h))
 
-    height, width = exp_img.shape[:2]
-    rot_mat = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1.0)
-    rot_img = cv2.warpAffine(exp_img, rot_mat, (width, height))
     if expand:
-        # Pad to get the same aspect ratio
-        if (image.shape[0] / image.shape[1]) != (rot_img.shape[0] / rot_img.shape[1]):
+        # Pad to get the same aspect ratio if needed
+        orig_h, orig_w = image.shape[:2]
+        orig_ratio = orig_h / orig_w
+        rot_ratio = rot_img.shape[0] / rot_img.shape[1]
+        if orig_ratio != rot_ratio:
             # Pad width
-            if (rot_img.shape[0] / rot_img.shape[1]) > (image.shape[0] / image.shape[1]):
-                h_pad, w_pad = 0, int(rot_img.shape[0] * image.shape[1] / image.shape[0] - rot_img.shape[1])
+            if rot_ratio > orig_ratio:
+                new_w = int(rot_img.shape[0] * orig_w / orig_h)
+                w_pad = new_w - rot_img.shape[1]
+                h_pad = 0
             # Pad height
             else:
-                h_pad, w_pad = int(rot_img.shape[1] * image.shape[0] / image.shape[1] - rot_img.shape[0]), 0
-            rot_img = np.pad(rot_img, ((h_pad // 2, h_pad - h_pad // 2), (w_pad // 2, w_pad - w_pad // 2), (0, 0)))
+                new_h = int(rot_img.shape[1] * orig_h / orig_w)
+                h_pad = new_h - rot_img.shape[0]
+                w_pad = 0
+            pad_spec = (
+                (h_pad // 2, h_pad - h_pad // 2),
+                (w_pad // 2, w_pad - w_pad // 2),
+                (0, 0),
+            )
+            rot_img = np.pad(rot_img, pad_spec)
+
         if preserve_origin_shape:
-            # rescale
-            rot_img = cv2.resize(rot_img, image.shape[:-1][::-1], interpolation=cv2.INTER_LINEAR)
+            rot_img = cv2.resize(
+                rot_img, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR
+            )
 
     return rot_img
 
@@ -359,13 +421,17 @@ def estimate_page_angle(polys: np.ndarray) -> float:
     with np.errstate(divide="raise", invalid="raise"):
         try:
             return float(
-                np.median(np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi)  # Y axis from top to bottom!
+                np.median(
+                    np.arctan((yleft - yright) / (xright - xleft)) * 180 / np.pi
+                )  # Y axis from top to bottom!
             )
         except FloatingPointError:
             return 0.0
 
 
-def convert_to_relative_coords(geoms: np.ndarray, img_shape: tuple[int, int]) -> np.ndarray:
+def convert_to_relative_coords(
+    geoms: np.ndarray, img_shape: tuple[int, int]
+) -> np.ndarray:
    """Convert a geometry to relative coordinates

     Args:
@@ -404,7 +470,9 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
     if boxes.shape[0] == 0:
         return []
     if boxes.shape[1] != 4:
-        raise AssertionError("boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)")
+        raise AssertionError(
+            "boxes are expected to be relative and in order (xmin, ymin, xmax, ymax)"
+        )
 
     # Project relative coordinates
     _boxes = boxes.copy()
@@ -420,7 +488,10 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray) -> list[np.ndarray]:
 
 def extract_rcrops(
-    img: np.ndarray, polys: np.ndarray, dtype=np.float32, assume_horizontal: bool = False
+    img: np.ndarray,
+    polys: np.ndarray,
+    dtype=np.float32,
+    assume_horizontal: bool = False,
 ) -> list[np.ndarray]:
     """Created cropped images from list of rotated bounding boxes
 
     Args:
@@ -436,7 +507,9 @@ def extract_rcrops(
     if polys.shape[0] == 0:
         return []
     if polys.shape[1:] != (4, 2):
-        raise AssertionError("polys are expected to be quadrilateral, of shape (N, 4, 2)")
+        raise AssertionError(
+            "polys are expected to be quadrilateral, of shape (N, 4, 2)"
+        )
 
     # Project relative coordinates
     _boxes = polys.copy()
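A quick sanity-check of the behavioral changes above — a minimal sketch, not part of the patch, assuming the diff is applied to a doctr checkout and that numpy, doctr.models._utils and doctr.utils.geometry import as shown; the sample values are made up for illustration:

import numpy as np
from doctr.models._utils import invert_data_structure
from doctr.utils.geometry import rotate_image

# dict of lists -> list of dicts, and back through the same function
batch = {"boxes": [1, 2], "scores": [0.9, 0.8]}
records = invert_data_structure(batch)
assert records == [{"boxes": 1, "scores": 0.9}, {"boxes": 2, "scores": 0.8}]
assert invert_data_structure(records) == batch

# the new early exit: a zero-angle call with no expand/resize returns the input array untouched
page = np.zeros((128, 256, 3), dtype=np.uint8)
assert rotate_image(page, 0.0) is page

# expanded rotation, then resized back to the original shape via preserve_origin_shape
rotated = rotate_image(page, 30.0, expand=True, preserve_origin_shape=True)
assert rotated.shape == page.shape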