diff --git a/.gitignore b/.gitignore index db94eb1..f374bac 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,9 @@ cython_debug/ # VS Code # Add .vscode/ to the .gitignore file if you are using Visual Studio Code .vscode/ + +*.jpg +*.jpeg +*.png +*.gif +output*.txt diff --git a/.pylintrc b/.pylintrc index 30be330..9adf0f8 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,4 +7,6 @@ disable= C0411, # Wrong import order E1101, # No member W0612, # Unused variable - W0718 # Broad exception caught \ No newline at end of file + W0718, # Broad exception caught + R1705, # Unnecessary "else" after "return" + R0902, # Too many instance attributes \ No newline at end of file diff --git a/OCR4Linux.py b/OCR4Linux.py index 0ca8fd0..be45450 100644 --- a/OCR4Linux.py +++ b/OCR4Linux.py @@ -1,7 +1,7 @@ # ======================================================================================================================== # Author: # Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR. # The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving @@ -38,182 +38,240 @@ import os from PIL import Image import pytesseract -import cv2 -import numpy as np class TesseractConfig: """ - TesseractConfig is a class that provides functionality to preprocess images, - and extract text from them using Tesseract OCR. + TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images. + + langs (str): The languages to be used by Tesseract for OCR. + custom_config (str): Custom configuration string for Tesseract. + ouput_encoding (str): The encoding to be used for the output file. Methods: - __init__(): - Initializes the TesseractConfig instance with command line arguments. - - preprocess_image(image): - Preprocesses the given image to improve OCR accuracy. - Args: - image (PIL.Image): The image to preprocess. - Returns: - PIL.Image: The preprocessed image. - - extract_text_with_lines(image): - Extracts text from the given image while preserving line breaks. - Args: - image (PIL.Image): The image from which to extract text. - Returns: - str: The extracted text with line breaks preserved. - - help(): - Prints the usage information for the script. - - main(): - The main method that processes the image and extracts text. - Returns: - int: 0 if successful, 1 otherwise. + __init__(self, image_path: str, output_path: str): + Initializes the TesseractConfig class with the provided image and output file paths. + + extract_text_with_lines(image: Image) -> str: + Uses Tesseract OCR to extract text from the provided image, preserving line breaks. + + main() -> int: + Main function to process the image and extract text. Performs validation, image processing, + text extraction, and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise. """ - def __init__(self): + def __init__(self, image_path: str, output_path: str): """ Initializes the OCR4Linux class with command-line arguments. Attributes: - args_num (int): The number of expected command-line arguments. - script_name (str): The name of the script being executed. image_path (str): The path to the input image file. output_path (str): The path to the output file where results will be saved. + oem_mode (int): The OCR Engine Mode (OEM) for Tesseract. + psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract. + langs (str): The languages to be used by Tesseract for OCR. + custom_config (str): Custom configuration string for Tesseract. + ouput_encoding (str): The encoding to be used for the output file. """ - self.args_num = 3 - self.script_name = sys.argv[0] - self.image_path = sys.argv[1] - self.output_path = sys.argv[2] + self.image_path = image_path + self.output_path = output_path + self.oem_mode = 3 # Default LSTM engine + self.psm_mode = 6 # Uniform block of text + self.available_langs = pytesseract.get_languages() + self.langs = '+'.join(filter(None, self.available_langs) + ) if self.available_langs else 'eng' + self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}' + self.ouput_encoding = 'utf-8' - def preprocess_image(self, image) -> Image: + def extract_text_with_lines(self, image: Image) -> str: """ - Preprocess image for better OCR accuracy. - - This function converts the input image to grayscale, applies thresholding - to binarize the image, and removes noise using a median blur filter. + This method uses Tesseract OCR to extract text from the provided image. Args: - image (PIL.Image.Image): The input image to preprocess. + image: The image from which to extract text. This should be a format + supported by the pytesseract library. Returns: - PIL.Image.Image: The preprocessed image. + A string containing the extracted text with line breaks preserved. """ - # Convert to grayscale - gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) - # Apply thresholding - thresh = cv2.threshold( - gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] - # Noise removal - denoised = cv2.medianBlur(thresh, 3) - return Image.fromarray(denoised) + return pytesseract.image_to_string( + image=image, lang=self.langs, config=self.custom_config) - def extract_text_with_lines(self, image: Image) -> str: + def main(self) -> int: """ - Extract text from an image while preserving line breaks. - - This method uses Tesseract OCR to extract text from the provided image, - preserving the layout and line breaks. It filters out low-confidence - results to improve the accuracy of the extracted text. + Main function to process the image and extract text. - Args: - image: The image from which to extract text. This should be a format - supported by the pytesseract library. + This function performs the following steps: + 1. Extracts text from the processed image while preserving line breaks. + 2. Saves the extracted text to an output file. Returns: - A string containing the extracted text with line breaks preserved. + int: 0 if text extraction is successful, 1 otherwise. """ - # Get image dimensions - custom_config = r'--oem 3 --psm 6' - # Extract text with layout preservation - data = pytesseract.image_to_data( - image, config=custom_config, output_type=pytesseract.Output.DICT) - - # Group text by line - lines = {} - for i, _ in enumerate(data['level']): - if int(data['conf'][i]) > 60: # Filter low confidence results - page_num = data['page_num'][i] - block_num = data['block_num'][i] - par_num = data['par_num'][i] - line_num = data['line_num'][i] - - key = f"{page_num}_{block_num}_{par_num}_{line_num}" - if key not in lines: - lines[key] = [] - lines[key].append(data['text'][i]) - - # Join text preserving line breaks - return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip()) + try: + # Open and process the image + with Image.open(self.image_path) as image: + # Extract text with line preservation + extracted_text = self.extract_text_with_lines(image) + + # Save the extracted text to a file + with open(self.output_path, 'w', encoding=self.ouput_encoding) as file: + file.write(extracted_text) + + return 0 + + except Exception as e: + print(f"Error processing image because: {str(e)}") + return 1 + + +class Program: + def __init__(self): + """ + Initializes the OCR4Linux class with the following attributes: + - args_num: Number of arguments expected by the script. + - author: Author of the script. + - email: Author's email address. + - github: URL to the GitHub repository. + - version: Version of the script. + - description: Brief description of the script's functionality. + - useges: List of usage examples for the script. + - examples: List of example commands for using the script. + - arguments: List of arguments that the script accepts with their descriptions. + """ + self.args_num = 3 + self.author = "Mohamed Hussein Al-Adawy" + self.email = "mohamed.h.eladwy@gmail.com" + self.github = "https://github.com/moheladwy/OCR4Linux" + self.version = "1.2.0" + self.description = \ + " OCR4Linux.py is a Python script that handles image preprocessing\n" + \ + " and text extraction using Tesseract OCR. The script takes an input\n" + \ + " based on the language in the image." + self.useges = [ + "python OCR4Linux.py ", + "python OCR4Linux.py [-l | --list-langs]", + "python OCR4Linux.py [-h | --help]" + ] + self.examples = [ + "python OCR4Linux.py screenshot.png output.txt", + "python OCR4Linux.py -l", + "python OCR4Linux.py -h" + ] + self.arguments = [ + "file_path: Path to the python script", + "image_path: Path to the image file", + "output_path: Path to the output text file", + "-l, --list-langs: List all available languages for OCR in the system", + "-h, --help: Display this help message, then exit" + ] def help(self) -> None: """ Prints the usage instructions for the OCR4Linux script. This method displays the correct way to run the script, including the required - arguments and their descriptions. - - Usage: - python - - Arguments: - file_path: Path to the python script - image_path: Path to the image file - output_path: Path to the output text file + arguments and their descriptions. It also provides examples of how to use the script. """ - print(f"Usage: python {self.script_name} ") + print("OCR4Linux - OCR script for Linux using Tesseract") + print(f"Version: {self.version}") + print(f"Author: {self.author}") + print(f"Email: {self.email}") + print(f"GitHub: {self.github}") + print() + print("Description:") + print(self.description) + print() + print("Usage:") + for usege in self.useges: + print(f" - {usege}") + print() + print("Example:") + for example in self.examples: + print(f" - {example}") + print() print("Arguments:") - print(" file_path: Path to the python script") - print(" image_path: Path to the image file") - print(" output_path: Path to the output text file") + for argument in self.arguments: + print(f" {argument}") - def main(self) -> int: + def check_arguments(self) -> int: """ - Main function to process the image and extract text. + Checks the command line arguments for validity. - This function performs the following steps: - 1. Checks command line arguments for validity. - 2. Verifies if the specified image file exists. - 3. Opens and processes the image. - 4. Extracts text from the processed image while preserving line breaks. - 5. Saves the extracted text to an output file. + Handles the following options: + - Standard usage: + - Help: -h or --help + - List languages: -l or --list-langs Returns: - int: 0 if text extraction is successful, 1 otherwise. + bool: True if arguments are valid, False otherwise. """ - # Check command line arguments - if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']: + if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']: + self.list_available_languages() + return 0 + elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']: + self.help() + return 0 + elif len(sys.argv) != self.args_num: self.help() return 1 + return 2 - # Check if file exists - if not os.path.exists(self.image_path): - print(f"Error: File '{self.image_path}' not found") - return 1 + def list_available_languages(self) -> None: + """ + Displays all available languages for Tesseract OCR. + """ + langs = pytesseract.get_languages() + if not langs: + print("Error: No languages found") + return - try: - # Open and process the image - with Image.open(self.image_path) as image: - # Preprocess the image - processed_image = self.preprocess_image(image) + print("Available languages for OCR:") + for lang in langs: + print(f" - {lang}") - # Extract text with line preservation - extracted_text = self.extract_text_with_lines(processed_image) + def check_image_path(self, image_path: str) -> bool: + """ + Checks if the specified image file exists. - # Save the extracted text to a file - with open(self.output_path, 'w', encoding='utf-8') as file: - file.write(extracted_text) + Args: + image_path: The path to the image file to be checked. - print("Text extraction completed successfully") - return 0 + Returns: + bool: True if the image file exists, False otherwise. + """ + if not os.path.exists(image_path): + print(f"Error: File '{image_path}' not found") + return False + return True - except Exception as e: - print(f"Error processing image because: {str(e)}") + def main(self): + """ + Main function to execute the OCR process. + + This function performs the following steps: + 1. Checks if the correct number of arguments is provided. + 2. Verifies if the image file exists. + 3. Creates an instance of the TesseractConfig class and runs the OCR process. + + Returns: + int: Returns 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function. + """ + # Check if the correct number of arguments is provided + result = self.check_arguments() + if result == 1: + return 1 + elif result == 0: + return 0 + + # Check if the image file exists + if not self.check_image_path(sys.argv[1]): return 1 + # Create an instance of the TesseractConfig class + tesseract = TesseractConfig(sys.argv[1], sys.argv[2]) + return tesseract.main() + if __name__ == "__main__": - sys.exit(TesseractConfig().main()) + sys.exit(Program().main()) diff --git a/OCR4Linux.sh b/OCR4Linux.sh index 985f2fb..94c9b96 100755 --- a/OCR4Linux.sh +++ b/OCR4Linux.sh @@ -1,7 +1,7 @@ #!/bin/bash # ======================================================================================================================== # Author: Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # OCR4Linux is a versatile text extraction tool for Linux systems that: # 1. Takes screenshots of selected areas using: diff --git a/setup.sh b/setup.sh index 1e4126d..d29a3d8 100755 --- a/setup.sh +++ b/setup.sh @@ -2,7 +2,7 @@ # ======================================================================================================================== # Author: # Mohamed Hussein Al-Adawy -# Version: 1.1.0 +# Version: 1.2.0 # Description: # This setup script installs and configures OCR4Linux and its dependencies. # It handles the installation of: