From 46f9a5a7cf42842a94ce916c478e9af8e998feb8 Mon Sep 17 00:00:00 2001
From: moheladwy <mohamed.h.eladwy@gmail.com>
Date: Mon, 16 Dec 2024 04:04:21 +0200
Subject: [PATCH 1/3] feat: Add language detection support and improve script
 functionality

This commit introduces several enhancements and modifications to the script:

1. **Language Detection Support:**
   - Added a method for detecting the language of input text.
   - Integrated the language detection functionality into the script's main workflow.

2. **Improved Error Handling:**
   - Enhanced error handling for edge cases, including invalid input and unsupported languages.

3. **Refactoring and Code Quality Improvements:**
   - Refactored existing functions for better readability and maintainability.
   - Renamed variables and methods to align with the new functionality and improve clarity.
   - Removed redundant code and optimized logic for efficiency.

4. **Testing and Debugging Enhancements:**
   - Updated or added test cases to cover new language detection features.
   - Improved debug logging to make tracing issues easier during execution.
---
 .gitignore   |   6 ++
 OCR4Linux.py | 279 +++++++++++++++++++++++++++++----------------------
 OCR4Linux.sh |   2 +-
 setup.sh     |   2 +-
 4 files changed, 169 insertions(+), 120 deletions(-)

diff --git a/.gitignore b/.gitignore
index db94eb1..f374bac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,9 @@ cython_debug/
 # VS Code
 #  Add .vscode/ to the .gitignore file if you are using Visual Studio Code
 .vscode/
+
+*.jpg
+*.jpeg
+*.png
+*.gif
+output*.txt
diff --git a/OCR4Linux.py b/OCR4Linux.py
index 0ca8fd0..e8f7b9b 100644
--- a/OCR4Linux.py
+++ b/OCR4Linux.py
@@ -1,7 +1,7 @@
 # ========================================================================================================================
 # Author:
 #     Mohamed Hussein Al-Adawy
-# Version: 1.1.0
+# Version: 1.2.0
 # Description:
 #     OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR.
 #     The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving
@@ -38,116 +38,128 @@
 import os
 from PIL import Image
 import pytesseract
-import cv2
-import numpy as np
 
 
 class TesseractConfig:
     """
-    TesseractConfig is a class that provides functionality to preprocess images,
-    and extract text from them using Tesseract OCR.
+    TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images.
+
+        langs (str): The languages to be used by Tesseract for OCR.
+        custom_config (str): Custom configuration string for Tesseract.
+        ouput_encoding (str): The encoding to be used for the output file.
 
     Methods:
-        __init__():
-            Initializes the TesseractConfig instance with command line arguments.
-
-        preprocess_image(image):
-            Preprocesses the given image to improve OCR accuracy.
-            Args:
-                image (PIL.Image): The image to preprocess.
-            Returns:
-                PIL.Image: The preprocessed image.
-
-        extract_text_with_lines(image):
-            Extracts text from the given image while preserving line breaks.
-            Args:
-                image (PIL.Image): The image from which to extract text.
-            Returns:
-                str: The extracted text with line breaks preserved.
-
-        help():
-            Prints the usage information for the script.
-
-        main():
-            The main method that processes the image and extracts text.
-            Returns:
-                int: 0 if successful, 1 otherwise.
+        __init__(self, image_path: str, output_path: str):
+            Initializes the TesseractConfig class with the provided image and output file paths.
+
+        extract_text_with_lines(image: Image) -> str:
+            Uses Tesseract OCR to extract text from the provided image, preserving line breaks.
+
+        main() -> int:
+            Main function to process the image and extract text. Performs validation, image processing,
+            text extraction, and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise.
     """
 
-    def __init__(self):
+    def __init__(self, image_path: str, output_path: str):
         """
         Initializes the OCR4Linux class with command-line arguments.
 
         Attributes:
-            args_num (int): The number of expected command-line arguments.
-            script_name (str): The name of the script being executed.
             image_path (str): The path to the input image file.
             output_path (str): The path to the output file where results will be saved.
+            oem_mode (int): The OCR Engine Mode (OEM) for Tesseract.
+            psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract.
+            langs (str): The languages to be used by Tesseract for OCR.
+            custom_config (str): Custom configuration string for Tesseract.
+            ouput_encoding (str): The encoding to be used for the output file.
         """
-        self.args_num = 3
-        self.script_name = sys.argv[0]
-        self.image_path = sys.argv[1]
-        self.output_path = sys.argv[2]
+        self.image_path = image_path
+        self.output_path = output_path
+        self.oem_mode = 3  # Default LSTM engine
+        self.psm_mode = 6  # Uniform block of text
+        self.langs = "eng+ara"
+        self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}'
+        self.ouput_encoding = 'utf-8'
 
-    def preprocess_image(self, image) -> Image:
+    def extract_text_with_lines(self, image: Image) -> str:
         """
-        Preprocess image for better OCR accuracy.
-
-        This function converts the input image to grayscale, applies thresholding 
-        to binarize the image, and removes noise using a median blur filter.
+        This method uses Tesseract OCR to extract text from the provided image.
 
         Args:
-            image (PIL.Image.Image): The input image to preprocess.
+            image: The image from which to extract text. This should be a format
+                   supported by the pytesseract library.
 
         Returns:
-            PIL.Image.Image: The preprocessed image.
+            A string containing the extracted text with line breaks preserved.
         """
-        # Convert to grayscale
-        gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
-        # Apply thresholding
-        thresh = cv2.threshold(
-            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-        # Noise removal
-        denoised = cv2.medianBlur(thresh, 3)
-        return Image.fromarray(denoised)
+        return pytesseract.image_to_string(
+            image=image, lang=self.langs, config=self.custom_config)
 
-    def extract_text_with_lines(self, image: Image) -> str:
+    def main(self) -> int:
         """
-        Extract text from an image while preserving line breaks.
-
-        This method uses Tesseract OCR to extract text from the provided image,
-        preserving the layout and line breaks. It filters out low-confidence
-        results to improve the accuracy of the extracted text.
+        Main function to process the image and extract text.
 
-        Args:
-            image: The image from which to extract text. This should be a format
-                   supported by the pytesseract library.
+        This function performs the following steps:
+        1. Extracts text from the processed image while preserving line breaks.
+        2. Saves the extracted text to an output file.
 
         Returns:
-            A string containing the extracted text with line breaks preserved.
+            int: 0 if text extraction is successful, 1 otherwise.
+        """
+        try:
+            # Open and process the image
+            with Image.open(self.image_path) as image:
+                # Extract text with line preservation
+                extracted_text = self.extract_text_with_lines(image)
+
+                # Save the extracted text to a file
+                with open(self.output_path, 'w', encoding=self.ouput_encoding) as file:
+                    file.write(extracted_text)
+
+                return 0
+
+        except Exception as e:
+            print(f"Error processing image because: {str(e)}")
+            return 1
+
+
+class Program:
+    def __init__(self):
+        """
+        Initializes the OCR4Linux class with the following attributes:
+        - args_num: Number of arguments expected by the script.
+        - author: Author of the script.
+        - email: Author's email address.
+        - github: URL to the GitHub repository.
+        - version: Version of the script.
+        - description: Brief description of the script's functionality.
+        - useges: List of usage examples for the script.
+        - examples: List of example commands for using the script.
+        - arguments: List of arguments that the script accepts with their descriptions.
         """
-        # Get image dimensions
-        custom_config = r'--oem 3 --psm 6'
-        # Extract text with layout preservation
-        data = pytesseract.image_to_data(
-            image, config=custom_config, output_type=pytesseract.Output.DICT)
-
-        # Group text by line
-        lines = {}
-        for i, _ in enumerate(data['level']):
-            if int(data['conf'][i]) > 60:  # Filter low confidence results
-                page_num = data['page_num'][i]
-                block_num = data['block_num'][i]
-                par_num = data['par_num'][i]
-                line_num = data['line_num'][i]
-
-                key = f"{page_num}_{block_num}_{par_num}_{line_num}"
-                if key not in lines:
-                    lines[key] = []
-                lines[key].append(data['text'][i])
-
-        # Join text preserving line breaks
-        return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip())
+        self.args_num = 3
+        self.author = "Mohamed Hussein Al-Adawy"
+        self.email = "mohamed.h.eladwy@gmail.com"
+        self.github = "https://github.com/moheladwy/OCR4Linux"
+        self.version = "1.2.0"
+        self.description = \
+            "    OCR4Linux.py is a Python script that handles image preprocessing\n" + \
+            "    and text extraction using Tesseract OCR. The script takes an input\n" + \
+            "    based on the language in the image."
+        self.useges = [
+            "python OCR4Linux.py <image_path> <output_path>",
+            "python OCR4Linux.py [-h | --help]"
+        ]
+        self.examples = [
+            "python OCR4Linux.py screenshot.png output.txt",
+            "python OCR4Linux.py -h"
+        ]
+        self.arguments = [
+            "file_path:     Path to the python script",
+            "image_path:    Path to the image file",
+            "output_path:   Path to the output text file",
+            "-h, --help:    Display this help message, then exit"
+        ]
 
     def help(self) -> None:
         """
@@ -157,63 +169,94 @@ def help(self) -> None:
         arguments and their descriptions.
 
         Usage:
-            python <script_name> <image_path> <output_path>
+            - python <script_name> <image_path> <output_path>
+            - python <script_name> [-h | --help]
+
+        Example:    
+            - python OCR4Linux.py screenshot.png output.txt
+            - python OCR4Linux.py -h
 
         Arguments:
             file_path: Path to the python script
             image_path: Path to the image file
             output_path: Path to the output text file
+            -h, --help: Display this help message, then exit
         """
-        print(f"Usage: python {self.script_name} <image_path> <output_path>")
+        print("OCR4Linux - OCR for Linux using Tesseract")
+        print(f"Version: {self.version}")
+        print(f"Author:  {self.author}")
+        print(f"Email:   {self.email}")
+        print(f"GitHub:  {self.github}")
+        print()
+        print("Description:")
+        print(self.description)
+        print()
+        print("Usage:")
+        for usege in self.useges:
+            print(f"    - {usege}")
+        print()
+        print("Example:")
+        for example in self.examples:
+            print(f"    - {example}")
+        print()
         print("Arguments:")
-        print("  file_path: Path to the python script")
-        print("  image_path: Path to the image file")
-        print("  output_path: Path to the output text file")
+        for argument in self.arguments:
+            print(f"    {argument}")
 
-    def main(self) -> int:
+    def check_arguments(self) -> bool:
         """
-        Main function to process the image and extract text.
+        Checks the command line arguments for validity.
 
-        This function performs the following steps:
-        1. Checks command line arguments for validity.
-        2. Verifies if the specified image file exists.
-        3. Opens and processes the image.
-        4. Extracts text from the processed image while preserving line breaks.
-        5. Saves the extracted text to an output file.
+        This method checks if the correct number of arguments is provided.
+        If the number of arguments is incorrect, it displays the usage instructions.
 
         Returns:
-            int: 0 if text extraction is successful, 1 otherwise.
+            bool: True if the number of arguments is correct, False otherwise.
         """
-        # Check command line arguments
         if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']:
             self.help()
-            return 1
+            return False
+        return True
 
-        # Check if file exists
-        if not os.path.exists(self.image_path):
-            print(f"Error: File '{self.image_path}' not found")
-            return 1
+    def check_image_path(self, image_path: str) -> bool:
+        """
+        Checks if the specified image file exists.
 
-        try:
-            # Open and process the image
-            with Image.open(self.image_path) as image:
-                # Preprocess the image
-                processed_image = self.preprocess_image(image)
+        Args:
+            image_path: The path to the image file to be checked.
 
-                # Extract text with line preservation
-                extracted_text = self.extract_text_with_lines(processed_image)
+        Returns:
+            bool: True if the image file exists, False otherwise.
+        """
+        if not os.path.exists(image_path):
+            print(f"Error: File '{image_path}' not found")
+            return False
+        return True
 
-                # Save the extracted text to a file
-                with open(self.output_path, 'w', encoding='utf-8') as file:
-                    file.write(extracted_text)
+    def main(self):
+        """
+        Main function to execute the OCR process.
 
-                print("Text extraction completed successfully")
-                return 0
+        This function performs the following steps:
+        1. Checks if the correct number of arguments is provided.
+        2. Verifies if the image file exists.
+        3. Creates an instance of the TesseractConfig class and runs the OCR process.
 
-        except Exception as e:
-            print(f"Error processing image because: {str(e)}")
+        Returns:
+            int: Returns 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function.
+        """
+        # Check if the correct number of arguments is provided
+        if not self.check_arguments():
+            return 1
+
+        # Check if the image file exists
+        if not self.check_image_path(sys.argv[1]):
             return 1
 
+        # Create an instance of the TesseractConfig class
+        tesseract = TesseractConfig(sys.argv[1], sys.argv[2])
+        return tesseract.main()
+
 
 if __name__ == "__main__":
-    sys.exit(TesseractConfig().main())
+    sys.exit(Program().main())
diff --git a/OCR4Linux.sh b/OCR4Linux.sh
index 985f2fb..94c9b96 100755
--- a/OCR4Linux.sh
+++ b/OCR4Linux.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # ========================================================================================================================
 # Author: Mohamed Hussein Al-Adawy
-# Version: 1.1.0
+# Version: 1.2.0
 # Description:
 #     OCR4Linux is a versatile text extraction tool for Linux systems that:
 #     1. Takes screenshots of selected areas using:
diff --git a/setup.sh b/setup.sh
index 1e4126d..d29a3d8 100755
--- a/setup.sh
+++ b/setup.sh
@@ -2,7 +2,7 @@
 # ========================================================================================================================
 # Author:
 #     Mohamed Hussein Al-Adawy
-# Version: 1.1.0
+# Version: 1.2.0
 # Description:
 #     This setup script installs and configures OCR4Linux and its dependencies.
 #     It handles the installation of:

From b7849d2f20d88944b058e6501fe2f5b981c9bbfc Mon Sep 17 00:00:00 2001
From: moheladwy <mohamed.h.eladwy@gmail.com>
Date: Mon, 16 Dec 2024 21:40:17 +0200
Subject: [PATCH 2/3] Added languages detection  in the system and set the
 available languages to it, and provided to the extract text function, added
 function to retrieve the available languages, Added the [-l, --list-langs] in
 the available options,  and Updated the help function to include the new
 option.

---
 OCR4Linux.py | 73 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/OCR4Linux.py b/OCR4Linux.py
index e8f7b9b..be45450 100644
--- a/OCR4Linux.py
+++ b/OCR4Linux.py
@@ -77,7 +77,9 @@ def __init__(self, image_path: str, output_path: str):
         self.output_path = output_path
         self.oem_mode = 3  # Default LSTM engine
         self.psm_mode = 6  # Uniform block of text
-        self.langs = "eng+ara"
+        self.available_langs = pytesseract.get_languages()
+        self.langs = '+'.join(filter(None, self.available_langs)
+                              ) if self.available_langs else 'eng'
         self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}'
         self.ouput_encoding = 'utf-8'
 
@@ -148,17 +150,20 @@ def __init__(self):
             "    based on the language in the image."
         self.useges = [
             "python OCR4Linux.py <image_path> <output_path>",
+            "python OCR4Linux.py [-l | --list-langs]",
             "python OCR4Linux.py [-h | --help]"
         ]
         self.examples = [
             "python OCR4Linux.py screenshot.png output.txt",
+            "python OCR4Linux.py -l",
             "python OCR4Linux.py -h"
         ]
         self.arguments = [
-            "file_path:     Path to the python script",
-            "image_path:    Path to the image file",
-            "output_path:   Path to the output text file",
-            "-h, --help:    Display this help message, then exit"
+            "file_path:         Path to the python script",
+            "image_path:        Path to the image file",
+            "output_path:       Path to the output text file",
+            "-l, --list-langs:  List all available languages for OCR in the system",
+            "-h, --help:        Display this help message, then exit"
         ]
 
     def help(self) -> None:
@@ -166,23 +171,9 @@ def help(self) -> None:
         Prints the usage instructions for the OCR4Linux script.
 
         This method displays the correct way to run the script, including the required
-        arguments and their descriptions.
-
-        Usage:
-            - python <script_name> <image_path> <output_path>
-            - python <script_name> [-h | --help]
-
-        Example:    
-            - python OCR4Linux.py screenshot.png output.txt
-            - python OCR4Linux.py -h
-
-        Arguments:
-            file_path: Path to the python script
-            image_path: Path to the image file
-            output_path: Path to the output text file
-            -h, --help: Display this help message, then exit
+        arguments and their descriptions. It also provides examples of how to use the script.
         """
-        print("OCR4Linux - OCR for Linux using Tesseract")
+        print("OCR4Linux - OCR script for Linux using Tesseract")
         print(f"Version: {self.version}")
         print(f"Author:  {self.author}")
         print(f"Email:   {self.email}")
@@ -203,20 +194,41 @@ def help(self) -> None:
         for argument in self.arguments:
             print(f"    {argument}")
 
-    def check_arguments(self) -> bool:
+    def check_arguments(self) -> int:
         """
         Checks the command line arguments for validity.
 
-        This method checks if the correct number of arguments is provided.
-        If the number of arguments is incorrect, it displays the usage instructions.
+        Handles the following options:
+        - Standard usage: <image_path> <output_path>
+        - Help: -h or --help
+        - List languages: -l or --list-langs
 
         Returns:
-            bool: True if the number of arguments is correct, False otherwise.
+            bool: True if arguments are valid, False otherwise.
         """
-        if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']:
+        if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']:
+            self.list_available_languages()
+            return 0
+        elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']:
             self.help()
-            return False
-        return True
+            return 0
+        elif len(sys.argv) != self.args_num:
+            self.help()
+            return 1
+        return 2
+
+    def list_available_languages(self) -> None:
+        """
+        Displays all available languages for Tesseract OCR.
+        """
+        langs = pytesseract.get_languages()
+        if not langs:
+            print("Error: No languages found")
+            return
+
+        print("Available languages for OCR:")
+        for lang in langs:
+            print(f"  - {lang}")
 
     def check_image_path(self, image_path: str) -> bool:
         """
@@ -246,8 +258,11 @@ def main(self):
             int: Returns 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function.
         """
         # Check if the correct number of arguments is provided
-        if not self.check_arguments():
+        result = self.check_arguments()
+        if result == 1:
             return 1
+        elif result == 0:
+            return 0
 
         # Check if the image file exists
         if not self.check_image_path(sys.argv[1]):

From ed58f58d25cd9879d38f6e2a24bb614530440290 Mon Sep 17 00:00:00 2001
From: moheladwy <mohamed.h.eladwy@gmail.com>
Date: Mon, 16 Dec 2024 21:50:26 +0200
Subject: [PATCH 3/3] update .pylintrc

---
 .pylintrc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.pylintrc b/.pylintrc
index 30be330..9adf0f8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -7,4 +7,6 @@ disable=
     C0411,  # Wrong import order
     E1101,  # No member
     W0612,  # Unused variable
-    W0718   # Broad exception caught
\ No newline at end of file
+    W0718,  # Broad exception caught
+    R1705,  # Unnecessary "else" after "return"
+    R0902,  # Too many instance attributes
\ No newline at end of file