From 46f9a5a7cf42842a94ce916c478e9af8e998feb8 Mon Sep 17 00:00:00 2001 From: moheladwy Date: Mon, 16 Dec 2024 04:04:21 +0200 Subject: [PATCH 1/3] feat: Add language detection support and improve script functionality This commit introduces several enhancements and modifications to the script: 1. **Language Detection Support:** - Added a method for detecting the language of input text. - Integrated the language detection functionality into the script's main workflow. 2. **Improved Error Handling:** - Enhanced error handling for edge cases, including invalid input and unsupported languages. 3. **Refactoring and Code Quality Improvements:** - Refactored existing functions for better readability and maintainability. - Renamed variables and methods to align with the new functionality and improve clarity. - Removed redundant code and optimized logic for efficiency. 4. **Testing and Debugging Enhancements:** - Updated or added test cases to cover new language detection features. - Improved debug logging to make tracing issues easier during execution. --- .gitignore | 6 ++ OCR4Linux.py | 279 +++++++++++++++++++++++++++++---------------------- OCR4Linux.sh | 2 +- setup.sh | 2 +- 4 files changed, 169 insertions(+), 120 deletions(-) diff --git a/.gitignore b/.gitignore index db94eb1..f374bac 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,9 @@ cython_debug/ # VS Code # Add .vscode/ to the .gitignore file if you are using Visual Studio Code .vscode/ + +*.jpg +*.jpeg +*.png +*.gif +output*.txt diff --git a/OCR4Linux.py b/OCR4Linux.py index 0ca8fd0..e8f7b9b 100644 --- a/OCR4Linux.py +++ b/OCR4Linux.py @@ -1,7 +1,7 @@ # ======================================================================================================================== # Author: # Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR. 
# The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving @@ -38,116 +38,128 @@ import os from PIL import Image import pytesseract -import cv2 -import numpy as np class TesseractConfig: """ - TesseractConfig is a class that provides functionality to preprocess images, - and extract text from them using Tesseract OCR. + TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images. + + langs (str): The languages to be used by Tesseract for OCR. + custom_config (str): Custom configuration string for Tesseract. + ouput_encoding (str): The encoding to be used for the output file. Methods: - __init__(): - Initializes the TesseractConfig instance with command line arguments. - - preprocess_image(image): - Preprocesses the given image to improve OCR accuracy. - Args: - image (PIL.Image): The image to preprocess. - Returns: - PIL.Image: The preprocessed image. - - extract_text_with_lines(image): - Extracts text from the given image while preserving line breaks. - Args: - image (PIL.Image): The image from which to extract text. - Returns: - str: The extracted text with line breaks preserved. - - help(): - Prints the usage information for the script. - - main(): - The main method that processes the image and extracts text. - Returns: - int: 0 if successful, 1 otherwise. + __init__(self, image_path: str, output_path: str): + Initializes the TesseractConfig class with the provided image and output file paths. + + extract_text_with_lines(image: Image) -> str: + Uses Tesseract OCR to extract text from the provided image, preserving line breaks. + + main() -> int: + Main function to process the image and extract text. Performs validation, image processing, + text extraction, and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise. 
""" - def __init__(self): + def __init__(self, image_path: str, output_path: str): """ Initializes the OCR4Linux class with command-line arguments. Attributes: - args_num (int): The number of expected command-line arguments. - script_name (str): The name of the script being executed. image_path (str): The path to the input image file. output_path (str): The path to the output file where results will be saved. + oem_mode (int): The OCR Engine Mode (OEM) for Tesseract. + psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract. + langs (str): The languages to be used by Tesseract for OCR. + custom_config (str): Custom configuration string for Tesseract. + ouput_encoding (str): The encoding to be used for the output file. """ - self.args_num = 3 - self.script_name = sys.argv[0] - self.image_path = sys.argv[1] - self.output_path = sys.argv[2] + self.image_path = image_path + self.output_path = output_path + self.oem_mode = 3 # Default LSTM engine + self.psm_mode = 6 # Uniform block of text + self.langs = "eng+ara" + self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}' + self.ouput_encoding = 'utf-8' - def preprocess_image(self, image) -> Image: + def extract_text_with_lines(self, image: Image) -> str: """ - Preprocess image for better OCR accuracy. - - This function converts the input image to grayscale, applies thresholding - to binarize the image, and removes noise using a median blur filter. + This method uses Tesseract OCR to extract text from the provided image. Args: - image (PIL.Image.Image): The input image to preprocess. + image: The image from which to extract text. This should be a format + supported by the pytesseract library. Returns: - PIL.Image.Image: The preprocessed image. + A string containing the extracted text with line breaks preserved. 
""" - # Convert to grayscale - gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) - # Apply thresholding - thresh = cv2.threshold( - gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] - # Noise removal - denoised = cv2.medianBlur(thresh, 3) - return Image.fromarray(denoised) + return pytesseract.image_to_string( + image=image, lang=self.langs, config=self.custom_config) - def extract_text_with_lines(self, image: Image) -> str: + def main(self) -> int: """ - Extract text from an image while preserving line breaks. - - This method uses Tesseract OCR to extract text from the provided image, - preserving the layout and line breaks. It filters out low-confidence - results to improve the accuracy of the extracted text. + Main function to process the image and extract text. - Args: - image: The image from which to extract text. This should be a format - supported by the pytesseract library. + This function performs the following steps: + 1. Extracts text from the image while preserving line breaks. + 2. Saves the extracted text to an output file. Returns: - A string containing the extracted text with line breaks preserved. + int: 0 if text extraction is successful, 1 otherwise. + """ + try: + # Open and process the image + with Image.open(self.image_path) as image: + # Extract text with line preservation + extracted_text = self.extract_text_with_lines(image) + + # Save the extracted text to a file + with open(self.output_path, 'w', encoding=self.ouput_encoding) as file: + file.write(extracted_text) + + return 0 + + except Exception as e: + print(f"Error processing image because: {str(e)}") + return 1 + + +class Program: + def __init__(self): + """ + Initializes the Program class with the following attributes: + - args_num: Number of arguments expected by the script. + - author: Author of the script. + - email: Author's email address. + - github: URL to the GitHub repository. + - version: Version of the script. 
+ - description: Brief description of the script's functionality. + - useges: List of usage examples for the script. + - examples: List of example commands for using the script. + - arguments: List of arguments that the script accepts with their descriptions. """ - # Get image dimensions - custom_config = r'--oem 3 --psm 6' - # Extract text with layout preservation - data = pytesseract.image_to_data( - image, config=custom_config, output_type=pytesseract.Output.DICT) - - # Group text by line - lines = {} - for i, _ in enumerate(data['level']): - if int(data['conf'][i]) > 60: # Filter low confidence results - page_num = data['page_num'][i] - block_num = data['block_num'][i] - par_num = data['par_num'][i] - line_num = data['line_num'][i] - - key = f"{page_num}_{block_num}_{par_num}_{line_num}" - if key not in lines: - lines[key] = [] - lines[key].append(data['text'][i]) - - # Join text preserving line breaks - return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip()) + self.args_num = 3 + self.author = "Mohamed Hussein Al-Adawy" + self.email = "mohamed.h.eladwy@gmail.com" + self.github = "https://github.com/moheladwy/OCR4Linux" + self.version = "1.2.0" + self.description = \ + " OCR4Linux.py is a Python script that handles image preprocessing\n" + \ + " and text extraction using Tesseract OCR. The script takes an input\n" + \ + " based on the language in the image." + self.useges = [ + "python OCR4Linux.py ", + "python OCR4Linux.py [-h | --help]" + ] + self.examples = [ + "python OCR4Linux.py screenshot.png output.txt", + "python OCR4Linux.py -h" + ] + self.arguments = [ + "file_path: Path to the python script", + "image_path: Path to the image file", + "output_path: Path to the output text file", + "-h, --help: Display this help message, then exit" + ] def help(self) -> None: """ @@ -157,63 +169,94 @@ def help(self) -> None: arguments and their descriptions. 
Usage: - python + - python + - python [-h | --help] + + Example: + - python OCR4Linux.py screenshot.png output.txt + - python OCR4Linux.py -h Arguments: file_path: Path to the python script image_path: Path to the image file output_path: Path to the output text file + -h, --help: Display this help message, then exit """ - print(f"Usage: python {self.script_name} ") + print("OCR4Linux - OCR for Linux using Tesseract") + print(f"Version: {self.version}") + print(f"Author: {self.author}") + print(f"Email: {self.email}") + print(f"GitHub: {self.github}") + print() + print("Description:") + print(self.description) + print() + print("Usage:") + for usege in self.useges: + print(f" - {usege}") + print() + print("Example:") + for example in self.examples: + print(f" - {example}") + print() print("Arguments:") - print(" file_path: Path to the python script") - print(" image_path: Path to the image file") - print(" output_path: Path to the output text file") + for argument in self.arguments: + print(f" {argument}") - def main(self) -> int: + def check_arguments(self) -> bool: """ - Main function to process the image and extract text. + Checks the command line arguments for validity. - This function performs the following steps: - 1. Checks command line arguments for validity. - 2. Verifies if the specified image file exists. - 3. Opens and processes the image. - 4. Extracts text from the processed image while preserving line breaks. - 5. Saves the extracted text to an output file. + This method checks if the correct number of arguments is provided. + If the number of arguments is incorrect, it displays the usage instructions. Returns: - int: 0 if text extraction is successful, 1 otherwise. + bool: True if the number of arguments is correct, False otherwise. 
""" - # Check command line arguments if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']: self.help() - return 1 + return False + return True - # Check if file exists - if not os.path.exists(self.image_path): - print(f"Error: File '{self.image_path}' not found") - return 1 + def check_image_path(self, image_path: str) -> bool: + """ + Checks if the specified image file exists. - try: - # Open and process the image - with Image.open(self.image_path) as image: - # Preprocess the image - processed_image = self.preprocess_image(image) + Args: + image_path: The path to the image file to be checked. - # Extract text with line preservation - extracted_text = self.extract_text_with_lines(processed_image) + Returns: + bool: True if the image file exists, False otherwise. + """ + if not os.path.exists(image_path): + print(f"Error: File '{image_path}' not found") + return False + return True - # Save the extracted text to a file - with open(self.output_path, 'w', encoding='utf-8') as file: - file.write(extracted_text) + def main(self): + """ + Main function to execute the OCR process. - print("Text extraction completed successfully") - return 0 + This function performs the following steps: + 1. Checks if the correct number of arguments is provided. + 2. Verifies if the image file exists. + 3. Creates an instance of the TesseractConfig class and runs the OCR process. - except Exception as e: - print(f"Error processing image because: {str(e)}") + Returns: + int: Returns 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function. 
+ """ + # Check if the correct number of arguments is provided + if not self.check_arguments(): + return 1 + + # Check if the image file exists + if not self.check_image_path(sys.argv[1]): return 1 + # Create an instance of the TesseractConfig class + tesseract = TesseractConfig(sys.argv[1], sys.argv[2]) + return tesseract.main() + if __name__ == "__main__": - sys.exit(TesseractConfig().main()) + sys.exit(Program().main()) diff --git a/OCR4Linux.sh b/OCR4Linux.sh index 985f2fb..94c9b96 100755 --- a/OCR4Linux.sh +++ b/OCR4Linux.sh @@ -1,7 +1,7 @@ #!/bin/bash # ======================================================================================================================== # Author: Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # OCR4Linux is a versatile text extraction tool for Linux systems that: # 1. Takes screenshots of selected areas using: diff --git a/setup.sh b/setup.sh index 1e4126d..d29a3d8 100755 --- a/setup.sh +++ b/setup.sh @@ -2,7 +2,7 @@ # ======================================================================================================================== # Author: # Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # This setup script installs and configures OCR4Linux and its dependencies. # It handles the installation of: From b7849d2f20d88944b058e6501fe2f5b981c9bbfc Mon Sep 17 00:00:00 2001 From: moheladwy Date: Mon, 16 Dec 2024 21:40:17 +0200 Subject: [PATCH 2/3] Added languages detection in the system and set the available languages to it, and provided to the extract text function, added function to retrieve the available languages, Added the [-l, --list-langs] in the available options, and Updated the help function to include the new option. 
--- OCR4Linux.py | 73 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/OCR4Linux.py b/OCR4Linux.py index e8f7b9b..be45450 100644 --- a/OCR4Linux.py +++ b/OCR4Linux.py @@ -77,7 +77,9 @@ def __init__(self, image_path: str, output_path: str): self.output_path = output_path self.oem_mode = 3 # Default LSTM engine self.psm_mode = 6 # Uniform block of text - self.langs = "eng+ara" + self.available_langs = pytesseract.get_languages() + self.langs = '+'.join(filter(None, self.available_langs) + ) if self.available_langs else 'eng' self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}' self.ouput_encoding = 'utf-8' @@ -148,17 +150,20 @@ def __init__(self): " based on the language in the image." self.useges = [ "python OCR4Linux.py ", + "python OCR4Linux.py [-l | --list-langs]", "python OCR4Linux.py [-h | --help]" ] self.examples = [ "python OCR4Linux.py screenshot.png output.txt", + "python OCR4Linux.py -l", "python OCR4Linux.py -h" ] self.arguments = [ - "file_path: Path to the python script", - "image_path: Path to the image file", - "output_path: Path to the output text file", - "-h, --help: Display this help message, then exit" + "file_path: Path to the python script", + "image_path: Path to the image file", + "output_path: Path to the output text file", + "-l, --list-langs: List all available languages for OCR in the system", + "-h, --help: Display this help message, then exit" ] def help(self) -> None: @@ -166,23 +171,9 @@ def help(self) -> None: Prints the usage instructions for the OCR4Linux script. This method displays the correct way to run the script, including the required - arguments and their descriptions. 
- - Usage: - - python - - python [-h | --help] - - Example: - - python OCR4Linux.py screenshot.png output.txt - - python OCR4Linux.py -h - - Arguments: - file_path: Path to the python script - image_path: Path to the image file - output_path: Path to the output text file - -h, --help: Display this help message, then exit + arguments and their descriptions. It also provides examples of how to use the script. """ - print("OCR4Linux - OCR for Linux using Tesseract") + print("OCR4Linux - OCR script for Linux using Tesseract") print(f"Version: {self.version}") print(f"Author: {self.author}") print(f"Email: {self.email}") @@ -203,20 +194,41 @@ def help(self) -> None: for argument in self.arguments: print(f" {argument}") - def check_arguments(self) -> bool: + def check_arguments(self) -> int: """ Checks the command line arguments for validity. - This method checks if the correct number of arguments is provided. - If the number of arguments is incorrect, it displays the usage instructions. + Handles the following options: + - Standard usage: + - Help: -h or --help + - List languages: -l or --list-langs Returns: - bool: True if the number of arguments is correct, False otherwise. + int: 0 if the help text or the language list was displayed, 1 if the argument count is invalid (help is displayed), 2 if the arguments are valid and OCR should proceed. """ - if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']: + if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']: + self.list_available_languages() + return 0 + elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']: self.help() - return False - return True + return 0 + elif len(sys.argv) != self.args_num: + self.help() + return 1 + return 2 + + def list_available_languages(self) -> None: + """ + Displays all available languages for Tesseract OCR. 
+ """ + langs = pytesseract.get_languages() + if not langs: + print("Error: No languages found") + return + + print("Available languages for OCR:") + for lang in langs: + print(f" - {lang}") def check_image_path(self, image_path: str) -> bool: """ @@ -246,8 +258,11 @@ def main(self): int: Returns 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function. """ # Check if the correct number of arguments is provided - if not self.check_arguments(): + result = self.check_arguments() + if result == 1: return 1 + elif result == 0: + return 0 # Check if the image file exists if not self.check_image_path(sys.argv[1]): From ed58f58d25cd9879d38f6e2a24bb614530440290 Mon Sep 17 00:00:00 2001 From: moheladwy Date: Mon, 16 Dec 2024 21:50:26 +0200 Subject: [PATCH 3/3] update .pylintrc --- .pylintrc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 30be330..9adf0f8 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,4 +7,6 @@ disable= C0411, # Wrong import order E1101, # No member W0612, # Unused variable - W0718 # Broad exception caught \ No newline at end of file + W0718, # Broad exception caught + R1705, # Unnecessary "else" after "return" + R0902, # Too many instance attributes \ No newline at end of file