From 9ade74b30b1d47cbae07bddce282735c1a237a70 Mon Sep 17 00:00:00 2001 From: "danny.morton714" Date: Wed, 20 Aug 2025 16:04:57 -0400 Subject: [PATCH 1/3] moved files --- src/notebooks/mainNb.ipynb | 2 +- src/notebooks/{ => old_notebooks}/cleaning.ipynb | 0 .../old_notebooks}/nick_demographicscleaning.ipynb | 0 src/notebooks/{ => old_notebooks}/visualization_examples.ipynb | 0 src/notebooks/{ => old_notebooks}/worc_cleaning.ipynb | 0 src/notebooks/{ => old_notebooks}/worc_employment_plots.ipynb | 0 6 files changed, 1 insertion(+), 1 deletion(-) rename src/notebooks/{ => old_notebooks}/cleaning.ipynb (100%) rename src/{old_files => notebooks/old_notebooks}/nick_demographicscleaning.ipynb (100%) rename src/notebooks/{ => old_notebooks}/visualization_examples.ipynb (100%) rename src/notebooks/{ => old_notebooks}/worc_cleaning.ipynb (100%) rename src/notebooks/{ => old_notebooks}/worc_employment_plots.ipynb (100%) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index cfe3734..6e8aca4 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -978,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "688bdf74", "metadata": {}, "outputs": [ diff --git a/src/notebooks/cleaning.ipynb b/src/notebooks/old_notebooks/cleaning.ipynb similarity index 100% rename from src/notebooks/cleaning.ipynb rename to src/notebooks/old_notebooks/cleaning.ipynb diff --git a/src/old_files/nick_demographicscleaning.ipynb b/src/notebooks/old_notebooks/nick_demographicscleaning.ipynb similarity index 100% rename from src/old_files/nick_demographicscleaning.ipynb rename to src/notebooks/old_notebooks/nick_demographicscleaning.ipynb diff --git a/src/notebooks/visualization_examples.ipynb b/src/notebooks/old_notebooks/visualization_examples.ipynb similarity index 100% rename from src/notebooks/visualization_examples.ipynb rename to src/notebooks/old_notebooks/visualization_examples.ipynb diff --git a/src/notebooks/worc_cleaning.ipynb b/src/notebooks/old_notebooks/worc_cleaning.ipynb similarity index 100% rename from src/notebooks/worc_cleaning.ipynb rename to src/notebooks/old_notebooks/worc_cleaning.ipynb diff --git a/src/notebooks/worc_employment_plots.ipynb b/src/notebooks/old_notebooks/worc_employment_plots.ipynb similarity index 100% rename from src/notebooks/worc_employment_plots.ipynb rename to src/notebooks/old_notebooks/worc_employment_plots.ipynb From 41c3e77a6310bcc3cc9a456954f50dea6210f3d3 Mon Sep 17 00:00:00 2001 From: dmorton714 Date: Thu, 21 Aug 2025 10:18:41 -0400 Subject: [PATCH 2/3] cleaned up the main class. 
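Renamed the DataCleaner helpers to clearer names (safe_drop_columns ->
drop_columns, safe_fillna -> fillna, safe_replace -> replace_column_values,
safe_convert_dtype -> convert_datetime, split_race -> normalize_race),
expanded the docstrings, and switched Race handling to collapse multi-value
entries into "Two or More Races".

A minimal sketch of the renamed chaining API, mirroring the notebook's own
example cell (column names come from the demo dataset):

```python
cleaner = DataCleaner(all_demo)

clean_df = (
    cleaner
    .drop_columns(["First Name", "Last Name"])       # missing columns are skipped
    .fillna({"Outcome": "Unknown"})                  # per-column NaN defaults
    .replace_column_values("Veteran", {"No": 0, "Yes": 1, "Unknown": -1})
    .convert_datetime("Auto Id", "datetime64[ns]")   # fails gracefully on bad input
    .normalize_gender()                              # unify transgender categories
    .normalize_race()                                # "White;Asian" -> "Two or More Races"
    .clean_salary()                                  # "$50k", ranges, hourly -> annual
    .finalize()                                      # returns the cleaned DataFrame
)
```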
--- src/notebooks/mainNb.ipynb | 572 +++++++++++++++++++++++++++++-------- 1 file changed, 458 insertions(+), 114 deletions(-) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index 6e8aca4..8e0cbfb 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -6,19 +6,21 @@ "metadata": {}, "source": [ "### **Table of Contents**\n", + "\n", " * [**Table of Contents**](#**table-of-contents**)\n", - " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", - " * [Example usage](#example-usage)\n", - " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", - " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", - " * [Update cleaning code](#update-cleaning-code)\n", - " * [Generate report](#generate-report)\n", - " * [Plots](#plots)" + "\n", + "- [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + "- [Example usage](#example-usage)\n", + " - [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " - [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + "- [Update cleaning code](#update-cleaning-code)\n", + "- [Generate report](#generate-report)\n", + "- [Plots](#plots)\n" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 2, "id": "d11a2343", "metadata": {}, "outputs": [], @@ -37,12 +39,12 @@ "id": "0764cac1", "metadata": {}, "source": [ - "## Function To Read in the Data! " + "## Function To Read in the Data!\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "7cd30f44", "metadata": {}, "outputs": [], @@ -98,7 +100,7 @@ "\n", " dataframes[p.stem] = df\n", "\n", - " return dataframes\n" + " return dataframes" ] }, { @@ -106,17 +108,20 @@ "id": "714769cf", "metadata": {}, "source": [ - "## Example usage \n", + "## Example usage\n", "\n", - "```python \n", + "```python\n", "dfs = load_data_folder()\n", "dfs.keys()\n", "```\n", + "\n", "output:\n", + "\n", "```bash\n", "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n", "```\n", - "#### To Access a DataFrame in the list \n", + "\n", + "#### To Access a DataFrame in the list\n", "\n", "```python\n", "all_demo = dfs['All_demographics_and_programs']\n", @@ -128,20 +133,18 @@ "|:--:|:--:|:--:|\n", "|3.14|name|apple|\n", "\n", - "\n", - "\n", "#### To Remove Spaces in DataFrame name\n", "\n", - "```python \n", + "```python\n", "for name, df in dfs.items():\n", " safe_name = name.replace(\" \", \"_\")\n", " globals()[safe_name] = df\n", - "```" + "```\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "15c0e5af", "metadata": {}, "outputs": [ @@ -151,7 +154,7 @@ "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" ] }, - "execution_count": 16, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -166,12 +169,12 @@ "id": "f643e1d8", "metadata": {}, "source": [ - "How to call the dataframe from the list above" + "How to call the dataframe from the list above\n" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "id": "5875ef3e", "metadata": {}, "outputs": [ @@ -313,7 +316,7 @@ "4 NaN NaN Tech Louisville 21-22 " ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -328,12 +331,12 @@ "id": "7e00a727", "metadata": {}, "source": [ - "Little for loop at access the dataframes individually" + "Little for loop at access the dataframes 
individually\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "c3c755a4", "metadata": {}, "outputs": [], @@ -345,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "id": "fa63b693", "metadata": {}, "outputs": [ @@ -598,7 +601,7 @@ "[32230 rows x 12 columns]" ] }, - "execution_count": 21, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -612,13 +615,14 @@ "id": "fe6f5506", "metadata": {}, "source": [ - "## Update cleaning code \n", - "- Look at our cleaning code that we have. \n", - "- we should start to make changes to it to account for this. \n", - "- We need to make it so it so the program doesn't crash when something fails \n", + "## Update cleaning code\n", + "\n", + "- Look at our cleaning code that we have.\n", + "- we should start to make changes to it to account for this.\n", + "- We need to make it so it so the program doesn't crash when something fails\n", " - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)\n", " - make the messages mean something meaningful\n", - "- Ideally we will not drop anything from our data \n" + "- Ideally we will not drop anything from our data\n" ] }, { @@ -626,39 +630,100 @@ "id": "29302c63", "metadata": {}, "source": [ - "Will update this a bit with usage etc... " + "Will update this a bit with usage etc...\n" ] }, { "cell_type": "code", - "execution_count": 16, - + "execution_count": null, "id": "749ae60a", "metadata": {}, "outputs": [], "source": [ "class DataCleaner:\n", " \"\"\"\n", - " General-purpose cleaner for multiple WORC datasets\n", - " (Employment, Enrollments, Demographics).\n", + " A utility class for cleaning and standardizing tabular datasets.\n", + "\n", + " This class wraps a pandas DataFrame and provides a set of \n", + " convenience methods for common data cleaning tasks such as:\n", "\n", - " Uses try/except for safety (does not break if col missing).\n", - " Keeps all rows (no drops), but fills/fixes when possible.\n", + " - Dropping unnecessary columns.\n", + " - Filling missing values with specified defaults.\n", + " - Replacing or normalizing categorical values.\n", + " - Converting data types safely (including datetime).\n", + " - Standardizing demographic fields (e.g., gender, race).\n", + " - Parsing and normalizing salary values.\n", + "\n", + " All methods are designed to fail gracefully:\n", + " - If a target column does not exist, it is skipped.\n", + " - If an operation fails due to incompatible data, a warning \n", + " is printed and the DataFrame remains unchanged.\n", + "\n", + " Most methods return `self`, enabling method chaining:\n", + "\n", + " Example\n", + " -------\n", + " >>> cleaner = DataCleaner(df)\n", + " >>> clean_df = (\n", + " ... cleaner\n", + " ... .drop_columns([\"UnusedCol\"])\n", + " ... .fillna({\"Age\": 0, \"City\": \"Unknown\"})\n", + " ... .normalize_gender()\n", + " ... .normalize_race()\n", + " ... .clean_salary()\n", + " ... .finalize()\n", + " ... )\n", " \"\"\"\n", "\n", " def __init__(self, df: pd.DataFrame):\n", " self.df = df.copy()\n", "\n", - " def safe_drop_columns(self, cols_to_drop):\n", - " \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n", + " def drop_columns(self, cols_to_drop) -> \"Self\":\n", + " \"\"\"\n", + " Drop one or more columns from the DataFrame safely.\n", + "\n", + " This method attempts to drop the specified columns. If a column \n", + " does not exist, it is ignored (no error is raised). 
If dropping \n", + " fails due to another issue (e.g., invalid argument type), a \n", + " warning is printed and the DataFrame is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " cols_to_drop : str or list of str\n", + " Column name or list of column names to drop.\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n", " except Exception as e:\n", " print(f\"[Warning] Failed dropping columns: {e}\")\n", " return self\n", "\n", - " def safe_fillna(self, fill_map: dict):\n", - " \"\"\"Fill NaN values for specific columns safely.\"\"\"\n", + " def fillna(self, fill_map: dict) -> \"Self\":\n", + " \"\"\"\n", + " Fill missing (NaN) values in specified columns safely.\n", + "\n", + " For each column provided in the mapping, this method replaces \n", + " NaN values with the specified fill value. Columns not present \n", + " in the DataFrame are skipped. If filling fails for a column \n", + " (e.g., due to incompatible data types), a warning is printed \n", + " and that column is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " fill_map : dict\n", + " A dictionary mapping {column_name: fill_value} pairs.\n", + " Example: {\"age\": 0, \"city\": \"Unknown\"}\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " for col, val in fill_map.items():\n", " try:\n", " if col in self.df.columns:\n", @@ -667,8 +732,31 @@ " print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n", " return self\n", "\n", - " def safe_replace(self, col, replacements: dict):\n", - " \"\"\"Replace values in a column safely.\"\"\"\n", + " def replace_column_values(self, col: str, replacements: dict) -> \"Self\":\n", + " \"\"\"\n", + " Replace values in a specified DataFrame column using a mapping dictionary.\n", + "\n", + " This method attempts to apply the given replacements safely. \n", + " If the column exists, it replaces matching values based on the \n", + " provided mapping. 
If an error occurs during replacement \n", + " (e.g., invalid mapping or data type mismatch), a warning \n", + " is printed and the DataFrame is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " col : str\n", + " The name of the column in the DataFrame to modify.\n", + " replacements : dict\n", + " A mapping of {old_value: new_value} pairs to replace.\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " Sample usage:\n", + " >>> cleaner = DataCleaner(df)\n", + " >>> cleaner.replace_column_values(\"status\", {\"yes\": 1, \"no\": 0})\n", + " \"\"\"\n", " try:\n", " if col in self.df.columns:\n", " self.df[col] = self.df[col].replace(replacements)\n", @@ -676,7 +764,7 @@ " print(f\"[Warning] Failed replacing values in {col}: {e}\")\n", " return self\n", "\n", - " def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n", + " def convert_datetime(self, col, dtype, errors=\"ignore\"):\n", " \"\"\"Convert column dtype safely.\"\"\"\n", " try:\n", " if col in self.df.columns:\n", @@ -689,8 +777,25 @@ " print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n", " return self\n", "\n", - " def normalize_gender(self):\n", - " \"\"\"Unify transgender categories safely.\"\"\"\n", + " def normalize_gender(self) -> \"Self\":\n", + " \"\"\"\n", + " Standardize gender labels in the DataFrame.\n", + "\n", + " This method looks for a column named \"Gender\" and replaces \n", + " specific transgender categories with the unified label \n", + " \"Transgender\". If the column does not exist or the replacement \n", + " fails (e.g., due to unexpected data types), the method prints a \n", + " warning and leaves the DataFrame unchanged.\n", + "\n", + " Replacements performed:\n", + " - \"Transgender male to female\" → \"Transgender\"\n", + " - \"Transgender female to male\" → \"Transgender\"\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " if \"Gender\" in self.df.columns:\n", " self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n", @@ -701,18 +806,35 @@ " print(f\"[Warning] Failed gender normalization: {e}\")\n", " return self\n", "\n", - " def split_race(self):\n", - " \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n", + " def normalize_race(self) -> \"Self\":\n", + " \"\"\"\n", + " Normalize the 'Race' column so that multi-value entries are \n", + " collapsed into a single category \"Two or More Races\".\n", + "\n", + " Behavior\n", + " --------\n", + " - Single race values are kept as-is.\n", + " - Multi-value entries separated by \";\" or \",\" are replaced with\n", + " \"Two or More Races\".\n", + "\n", + " Example\n", + " -------\n", + " Original: \"White;Asian\" → \"Two or More Races\"\n", + " Original: \"White,Asian\" → \"Two or More Races\"\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " if \"Race\" in self.df.columns:\n", - " splitting = self.df[\"Race\"].astype(\n", - " str).str.split(\";\", expand=True)\n", - " splitting.columns = [\n", - " f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n", - " self.df = pd.concat(\n", - " [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n", + " self.df[\"Race\"] = self.df[\"Race\"].astype(str).apply(\n", + " lambda x: \"Two or More Races\" if (\n", + " \";\" in x or \",\" in x) else x\n", + " )\n", " except Exception as e:\n", - " print(f\"[Warning] Failed 
race splitting: {e}\")\n", + " print(f\"[Warning] Failed race normalization: {e}\")\n", " return self\n", "\n", " def clean_salary(self, hours_per_year: int = 2080):\n", @@ -720,13 +842,14 @@ " Clean and standardize salary values in the DataFrame.\n", "\n", " Steps performed:\n", - " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → \"50000\").\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → 50000).\n", " 2. Handle ranges by converting them to the average value \n", - " (e.g., \"50,000-70,000\" → 60000).\n", - " 3. Convert values to numeric, coercing invalid entries to NaN.\n", - " 4. Treat values < 200 as hourly wages and convert to annual salaries \n", - " (multiplied by `hours_per_year`).\n", - " 5. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + " (e.g., \"50,000–70,000\" → 60000).\n", + " 3. Handle shorthand \"M\" (e.g., \"$1.5M\" → 1,500,000).\n", + " 4. Convert values to numeric, coercing invalid entries to NaN.\n", + " 5. Treat values <= 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 6. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", "\n", " Parameters\n", " ----------\n", @@ -741,12 +864,18 @@ " try:\n", " if \"Salary\" in self.df.columns:\n", " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + "\n", " def parse_salary(val: str):\n", " val = val.strip()\n", + " if not val or val.lower() in {\"nan\", \"none\"}:\n", + " return None\n", + "\n", + " # Normalize dash types (hyphen, en dash, em dash \"-\")\n", + " val = re.sub(r\"[–—]\", \"-\", val)\n", "\n", - " # Handle range like \"50k-70k\" or \"50,000–70,000\"\n", - " if \"-\" in val or \"–\" in val:\n", - " parts = re.split(r\"[-–]\", val)\n", + " # Handle range like \"50k-70k\" or \"50,000-70,000\"\n", + " if \"-\" in val:\n", + " parts = val.split(\"-\")\n", " nums = [parse_salary(p) for p in parts if p.strip()]\n", " nums = [n for n in nums if n is not None]\n", " return sum(nums) / len(nums) if nums else None\n", @@ -754,12 +883,17 @@ " # Remove $, commas, spaces\n", " val = re.sub(r\"[\\$,]\", \"\", val)\n", "\n", - " # Handle shorthand k/K (e.g., 50k -> 50000)\n", - " match = re.match(r\"(\\d+(\\.\\d+)?)([kK])\", val)\n", - " if match:\n", - " return float(match.group(1)) * 1000\n", + " # Handle shorthand k/K (e.g., \"50k\" → 50000)\n", + " match_k = re.match(r\"^(\\d+(\\.\\d+)?)[kK]$\", val)\n", + " if match_k:\n", + " return float(match_k.group(1)) * 1000\n", + "\n", + " # Handle shorthand M (e.g., \"1.5M\" → 1500000)\n", + " match_m = re.match(r\"^(\\d+(\\.\\d+)?)[mM]$\", val)\n", + " if match_m:\n", + " return float(match_m.group(1)) * 1_000_000\n", "\n", - " # Convert plain number if possible\n", + " # Plain number (integer or float)\n", " try:\n", " return float(val)\n", " except ValueError:\n", @@ -769,7 +903,8 @@ " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", "\n", " # Convert small numbers (hourly) to annual\n", - " self.df.loc[self.df[\"Salary\"] < 200, \"Salary\"] *= hours_per_year\n", + " self.df.loc[self.df[\"Salary\"] <=\n", + " 200, \"Salary\"] *= hours_per_year\n", "\n", " # Drop unrealistic salaries\n", " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", @@ -779,23 +914,226 @@ "\n", " return self\n", "\n", - " def finalize(self):\n", - " \"\"\"Return cleaned dataframe.\"\"\"\n", + " def finalize(self) -> pd.DataFrame:\n", + " \"\"\"\n", + " Finalize and return the cleaned DataFrame.\n", + "\n", + " This method should be 
called at the end of a cleaning pipeline \n", + " to retrieve the fully processed DataFrame after all applied \n", + " transformations.\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " The cleaned and transformed DataFrame.\n", + " \"\"\"\n", " return self.df" ] }, { "cell_type": "markdown", - + "id": "d84a5b95", + "metadata": {}, + "source": [ + "# example usage of each method\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "329da719", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/2d/yt4_w6zn5pbfjg_jx5sdmm180000gn/T/ipykernel_89024/677290603.py:130: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " self.df[col] = pd.to_datetime(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
1NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
2NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
3NaTMaleAsianUnknownSuccessfully Completed0UnknownNoUnknownTech Louisville 21-22
4NaTMaleBlack or African AmericanUnknownUnknown-1UnknownUnknownUnknownTech Louisville 21-22
\n", + "
" + ], + "text/plain": [ + " Auto Id Gender Race Ethnicity Hispanic/Latino \\\n", + "0 NaT Male Black or African American Unknown \n", + "1 NaT Male Black or African American Unknown \n", + "2 NaT Male Black or African American Unknown \n", + "3 NaT Male Asian Unknown \n", + "4 NaT Male Black or African American Unknown \n", + "\n", + " Outcome Veteran Ex-Offender Justice Involved Single Parent \\\n", + "0 Unknown 0 Unknown Unknown Unknown \n", + "1 Unknown 0 Unknown Unknown Unknown \n", + "2 Unknown 0 Unknown Unknown Unknown \n", + "3 Successfully Completed 0 Unknown No Unknown \n", + "4 Unknown -1 Unknown Unknown Unknown \n", + "\n", + " Program: Program Name \n", + "0 Reimage 21-22 \n", + "1 Reimage 21-22 \n", + "2 Reimage 21-22 \n", + "3 Tech Louisville 21-22 \n", + "4 Tech Louisville 21-22 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cleaner = DataCleaner(all_demo)\n", + "\n", + "clean_df = (\n", + " cleaner\n", + " # 1. Drop unneeded columns\n", + " .drop_columns([\"First Name\", \"Last Name\"])\n", + "\n", + " # 2. Fill missing values\n", + " .fillna({\n", + " \"Outcome\": \"Unknown\",\n", + " \"Veteran\": \"Unknown\",\n", + " \"Ex-Offender\": \"Unknown\",\n", + " \"Justice Involved\": \"Unknown\",\n", + " \"Single Parent\": \"Unknown\",\n", + " \"Ethnicity Hispanic/Latino\": \"Unknown\"\n", + " })\n", + "\n", + " # 3. Replace specific column values\n", + " .replace_column_values(\"Veteran\", {\"No\": 0, \"Yes\": 1, \"Unknown\": -1})\n", + "\n", + " # 4. Convert a column to datetime (pretend Auto Id is a date code)\n", + " .convert_datetime(\"Auto Id\", \"datetime64[ns]\") # will fail gracefully\n", + "\n", + " # 5. Normalize gender labels\n", + " .normalize_gender()\n", + "\n", + " # 6. Normalize race column (collapse multi-value)\n", + " .normalize_race()\n", + "\n", + " # 7. Clean salary column\n", + " .clean_salary()\n", + "\n", + " # 8. Finalize and return cleaned DataFrame\n", + " .finalize()\n", + ")\n", + "\n", + "clean_df.head()" + ] + }, + { + "cell_type": "markdown", "id": "3eb6373f", "metadata": {}, "source": [ - "### Sample use of the clean_salary function. 
" + "### Sample use of the clean_salary function.\n" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "id": "182eac4a", "metadata": {}, "outputs": [ @@ -811,7 +1149,7 @@ "4 75000.0\n", "5 100000.0\n", "6 150000.0\n", - "7 200.0\n", + "7 416000.0\n", "8 3000.0\n", "9 NaN\n", "10 NaN\n", @@ -838,7 +1176,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "id": "82806fc9", "metadata": {}, "outputs": [ @@ -856,7 +1194,7 @@ "6 NaN\n", "7 NaN\n", "8 NaN\n", - "9 NaN\n" + "9 75000.0\n" ] } ], @@ -961,7 +1299,8 @@ " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", "\n", " # Convert small numbers (hourly) to annual\n", - " self.df.loc[self.df[\"Salary\"] <= 200, \"Salary\"] *= hours_per_year\n", + " self.df.loc[self.df[\"Salary\"] <=\n", + " 200, \"Salary\"] *= hours_per_year\n", "\n", " # Drop unrealistic salaries\n", " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", @@ -973,12 +1312,12 @@ "\n", " def finalize(self):\n", " \"\"\"Return cleaned dataframe.\"\"\"\n", - " return self.df\n" + " return self.df" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "688bdf74", "metadata": {}, "outputs": [ @@ -994,16 +1333,16 @@ "# Test DataFrame with edge/fail cases\n", "fail_df = pd.DataFrame({\n", " \"Salary\": [\n", - " None, # NaN\n", - " \"\", # NaN\n", - " \" \", # NaN\n", - " \"abc123\", # NaN\n", - " \"50k-abc\", # 50000.0\n", - " \"$-5000\", # -5000.0 (still allowed for now)\n", - " \"∞\", # NaN\n", - " \"NaN\", # NaN\n", - " \"$1.5M\", # NaN ( >1,000,000 rule)\n", - " \"70,000—80,000\" # 75000.0 (dash normalized)\n", + " None, # NaN\n", + " \"\", # NaN\n", + " \" \", # NaN\n", + " \"abc123\", # NaN\n", + " \"50k-abc\", # 50000.0\n", + " \"$-5000\", # -5000.0 (still allowed for now)\n", + " \"∞\", # NaN\n", + " \"NaN\", # NaN\n", + " \"$1.5M\", # NaN ( >1,000,000 rule)\n", + " \"70,000—80,000\" # 75000.0 (dash normalized)\n", " ]\n", "})\n", "\n", @@ -1037,15 +1376,15 @@ "id": "6ddbb4c0", "metadata": {}, "source": [ - "## Generate report \n", + "## Generate report\n", "\n", "- Overall completion of program only accounting for the new style of classes m1-m4\n", - "- completion by year \n", - "- completion over all by pathway \n", - "- completion by year by pathway \n", - "- Feel free to get creative here adding gender etc to get us a better understanding \n", - "- education level and the above... 
\n", - "- export this as a txt file " + "- completion by year\n", + "- completion over all by pathway\n", + "- completion by year by pathway\n", + "- Feel free to get creative here adding gender etc to get us a better understanding\n", + "- education level and the above...\n", + "- export this as a txt file\n" ] }, { @@ -1061,15 +1400,16 @@ "id": "859cf674", "metadata": {}, "source": [ - "## Plots \n", - "- Look at the various plots \n", + "## Plots\n", + "\n", + "- Look at the various plots\n", "- make a consistent color scheme\n", - "- pick the plots that go with the report above \n", - "- make missing plots \n", + "- pick the plots that go with the report above\n", + "- make missing plots\n", "- make plots have the option to show & save in the functions\n", "\n", "see `src/notebooks/visualization_examples.ipynb`\n", - "See below from `src/Carmen_WORCEmployment_Plots.py`" + "See below from `src/Carmen_WORCEmployment_Plots.py`\n" ] }, { @@ -1088,13 +1428,15 @@ "\n", "def plot_avg_salary_by_city(data):\n", " region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()\n", - " region_salary.plot(kind='barh', figsize=(8, 5), title=\"Average Salary by KY Region\")\n", + " region_salary.plot(kind='barh', figsize=(\n", + " 8, 5), title=\"Average Salary by KY Region\")\n", " plt.xlabel(\"Average Salary\")\n", " plt.show()\n", "\n", "\n", "def plot_placements_over_time(data):\n", - " data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n", + " data.set_index('Start Date').resample('M').size().plot(\n", + " kind='line', marker='o', figsize=(10, 4))\n", " plt.title(\"Number of Placements Over Time\")\n", " plt.ylabel(\"Placements\")\n", " plt.show()\n", @@ -1102,7 +1444,8 @@ "\n", "def plot_placement_type_by_program(data):\n", " plt.figure(figsize=(10, 6))\n", - " sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')\n", + " sns.countplot(data=data, x='ATP Placement Type',\n", + " hue='Program: Program Name')\n", " plt.xticks(rotation=45)\n", " plt.title(\"Placement Type by Program\")\n", " plt.show()\n", @@ -1110,7 +1453,8 @@ "\n", "def plot_top_cities(data):\n", " city_counts = data['Mailing City'].value_counts().head(10)\n", - " city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", + " city_counts.plot(\n", + " kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", " plt.ylabel(\"Count\")\n", " plt.show()" ] @@ -1120,12 +1464,12 @@ "id": "f905708f", "metadata": {}, "source": [ - "TOC generator " + "TOC generator\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d4fc7116", "metadata": {}, "outputs": [ @@ -1182,13 +1526,13 @@ "\n", "\n", "notebook_path = 'mainNb.ipynb'\n", - "generate_toc_from_notebook(notebook_path)\n" + "generate_toc_from_notebook(notebook_path)" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "venv (3.12.2)", "language": "python", "name": "python3" }, @@ -1202,7 +1546,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.1" + "version": "3.12.2" } }, "nbformat": 4, From 27ec9463f45bfb47558428bbee36c6ab355abc2f Mon Sep 17 00:00:00 2001 From: dmorton714 Date: Thu, 21 Aug 2025 10:43:24 -0400 Subject: [PATCH 3/3] added doc strings --- src/notebooks/mainNb.ipynb | 109 +++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index 
8e0cbfb..3d7669e 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -7,8 +7,6 @@ "source": [ "### **Table of Contents**\n", "\n", - " * [**Table of Contents**](#**table-of-contents**)\n", - "\n", "- [Function To Read in the Data!](#function-to-read-in-the-data!)\n", "- [Example usage](#example-usage)\n", " - [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", @@ -44,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "7cd30f44", "metadata": {}, "outputs": [], @@ -610,6 +608,76 @@ "All_demographics_and_programs" ] }, + { + "cell_type": "markdown", + "id": "b83d9a39", + "metadata": {}, + "source": [ + "Should we switch to this rather than the 2 step process above?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a009686", + "metadata": {}, + "outputs": [], + "source": [ + "def load_data_folder(\n", + " folder_path: Union[str, os.PathLike] = \"../../data\",\n", + " safe_names: bool = False\n", + ") -> Dict[str, pd.DataFrame]:\n", + " \"\"\"\n", + " Load all CSV/XLS/XLSX files in a folder into pandas DataFrames.\n", + " ...\n", + " safe_names : bool, optional\n", + " If True, replace spaces in filenames with underscores for dict keys.\n", + " \"\"\"\n", + " path = Path(folder_path)\n", + " if not path.exists():\n", + " raise FileNotFoundError(f\"Folder not found: {path.resolve()}\")\n", + "\n", + " dataframes: Dict[str, pd.DataFrame] = {}\n", + " for p in path.iterdir():\n", + " if not p.is_file():\n", + " continue\n", + "\n", + " ext = p.suffix.lower()\n", + " if ext == \".csv\":\n", + " df = pd.read_csv(p)\n", + " elif ext in {\".xlsx\", \".xls\"}:\n", + " df = pd.read_excel(p)\n", + " else:\n", + " continue\n", + "\n", + " key = p.stem.replace(\" \", \"_\") if safe_names else p.stem\n", + " dataframes[key] = df\n", + "\n", + " return dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce7ffc41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs = load_data_folder(safe_names=True)\n", + "dfs.keys()" + ] + }, { "cell_type": "markdown", "id": "fe6f5506", @@ -765,7 +833,40 @@ " return self\n", "\n", " def convert_datetime(self, col, dtype, errors=\"ignore\"):\n", - " \"\"\"Convert column dtype safely.\"\"\"\n", + " \"\"\"\n", + " Convert a column to a specified dtype, with special handling for datetimes.\n", + "\n", + " Parameters\n", + " ----------\n", + " col : str\n", + " Name of the column to convert.\n", + " dtype : str or type\n", + " Target dtype. If the string contains \"datetime\", the method will use\n", + " `pandas.to_datetime` for conversion. 
Otherwise, it uses `.astype()`.\n",
+        "    errors : {\"ignore\", \"raise\", \"coerce\"}, default \"ignore\"\n",
+        "        Error handling behavior:\n",
+        "        - \"ignore\": invalid parsing will return the original input.\n",
+        "        - \"raise\": raises an exception on invalid parsing.\n",
+        "        - \"coerce\": invalid datetime parsing will be set as NaT\n",
+        "          (datetime conversions only; `.astype()` does not accept \"coerce\").\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    self : DataCleaner\n",
+        "        The instance with the modified DataFrame, allowing for method chaining.\n",
+        "\n",
+        "    Notes\n",
+        "    -----\n",
+        "    - For datetime conversion, the method forces `errors=\"coerce\"` to ensure\n",
+        "      invalid values are converted to NaT instead of raising.\n",
+        "    - For non-datetime conversions, the provided `errors` argument is passed\n",
+        "      directly to `.astype()`.\n",
+        "    - If the column does not exist, no action is taken.\n",
+        "\n",
+        "    Examples\n",
+        "    --------\n",
+        "    >>> cleaner.convert_datetime(\"StartDate\", \"datetime64[ns]\")\n",
+        "    >>> cleaner.convert_datetime(\"Age\", \"int\", errors=\"raise\")\n",
+        "    \"\"\"\n",
         "        try:\n",
         "            if col in self.df.columns:\n",
         "                if \"datetime\" in str(dtype):\n",