From 2adf9d0df2e2ebf9dcedf002176b07beaeed9a15 Mon Sep 17 00:00:00 2001
From: Dark_Nemesis <152402476+Ki1er@users.noreply.github.com>
Date: Sat, 19 Oct 2024 00:58:06 +0530
Subject: [PATCH] Implement Input Validation and Sanitization

- Added `is_valid_url()` function: Uses `urllib.parse.urlparse` to validate
  URL format, ensuring it includes a valid scheme and network location.
- Added `sanitize_locator()` function: Sanitizes element locators using a
  regular expression to remove potentially harmful characters, allowing only
  alphanumeric characters, periods, hash symbols, and square brackets.
- Modified `scrape_website()` function: Now uses the `is_valid_url()` function
  to validate user-provided URLs and the `sanitize_locator()` function to
  sanitize element locators before use. Improved input prompts to guide users.
---
 scraper.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/scraper.py b/scraper.py
index 435f4b6..272b12b 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,9 +1,21 @@
+import re
+from urllib.parse import urlparse # Import urlparse
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 
+def is_valid_url(url): # Include this function
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+def sanitize_locator(locator): # Include this function
+    sanitized_locator = re.sub(r"[^a-zA-Z0-9.#\[\]]", "", locator)
+    return sanitized_locator
 
 # Function to initialize the Selenium WebDriver
 def init_driver(headless=True):
@@ -46,8 +58,16 @@ def find_element(driver, strategy, value):
 # Function to scrape a webpage
 def scrape_website():
     driver = init_driver(headless=True)
-    websiteInp = input("Enter URL for desired website to scrape: ")
-    elementInp = input("Enter the class_name of the desired element to scrape: ")
+
+    while True: # loop until valid url is entered
+        websiteInp = input("Enter URL for desired website to scrape: ")
+        if is_valid_url(websiteInp):
+            break
+        else:
+            print("Invalid URL. Please enter a valid URL.")
+
+    elementInp = input("Enter the class_name (alphanumeric, ., #, [] allowed): ")
+    sanitized_element = sanitize_locator(elementInp) # Sanitize the input
     # testWebsite = "https://scrapingclub.com/"
     # testElement = "w-full.rounded.border"
     try:
@@ -58,7 +78,7 @@ def scrape_website():
 
     # Find elements using different strategies
     exercise1_card = find_element(
-        driver, "class_name", elementInp
+        driver, "class_name", sanitized_element
     )  # Use testElement here instead of elementInp
     # you can also use:
     # exercise1_card = find_element(driver, 'css_selector', '.w-full.rounded.border')