From 9ade74b30b1d47cbae07bddce282735c1a237a70 Mon Sep 17 00:00:00 2001 From: "danny.morton714" Date: Wed, 20 Aug 2025 16:04:57 -0400 Subject: [PATCH 1/3] moved files --- src/notebooks/mainNb.ipynb | 2 +- src/notebooks/{ => old_notebooks}/cleaning.ipynb | 0 .../old_notebooks}/nick_demographicscleaning.ipynb | 0 src/notebooks/{ => old_notebooks}/visualization_examples.ipynb | 0 src/notebooks/{ => old_notebooks}/worc_cleaning.ipynb | 0 src/notebooks/{ => old_notebooks}/worc_employment_plots.ipynb | 0 6 files changed, 1 insertion(+), 1 deletion(-) rename src/notebooks/{ => old_notebooks}/cleaning.ipynb (100%) rename src/{old_files => notebooks/old_notebooks}/nick_demographicscleaning.ipynb (100%) rename src/notebooks/{ => old_notebooks}/visualization_examples.ipynb (100%) rename src/notebooks/{ => old_notebooks}/worc_cleaning.ipynb (100%) rename src/notebooks/{ => old_notebooks}/worc_employment_plots.ipynb (100%) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index cfe3734..6e8aca4 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -978,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "688bdf74", "metadata": {}, "outputs": [ diff --git a/src/notebooks/cleaning.ipynb b/src/notebooks/old_notebooks/cleaning.ipynb similarity index 100% rename from src/notebooks/cleaning.ipynb rename to src/notebooks/old_notebooks/cleaning.ipynb diff --git a/src/old_files/nick_demographicscleaning.ipynb b/src/notebooks/old_notebooks/nick_demographicscleaning.ipynb similarity index 100% rename from src/old_files/nick_demographicscleaning.ipynb rename to src/notebooks/old_notebooks/nick_demographicscleaning.ipynb diff --git a/src/notebooks/visualization_examples.ipynb b/src/notebooks/old_notebooks/visualization_examples.ipynb similarity index 100% rename from src/notebooks/visualization_examples.ipynb rename to src/notebooks/old_notebooks/visualization_examples.ipynb diff --git a/src/notebooks/worc_cleaning.ipynb b/src/notebooks/old_notebooks/worc_cleaning.ipynb similarity index 100% rename from src/notebooks/worc_cleaning.ipynb rename to src/notebooks/old_notebooks/worc_cleaning.ipynb diff --git a/src/notebooks/worc_employment_plots.ipynb b/src/notebooks/old_notebooks/worc_employment_plots.ipynb similarity index 100% rename from src/notebooks/worc_employment_plots.ipynb rename to src/notebooks/old_notebooks/worc_employment_plots.ipynb From 41c3e77a6310bcc3cc9a456954f50dea6210f3d3 Mon Sep 17 00:00:00 2001 From: dmorton714 Date: Thu, 21 Aug 2025 10:18:41 -0400 Subject: [PATCH 2/3] cleaned up the main class. 
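Renamed the DataCleaner helpers to clearer names (safe_drop_columns ->
drop_columns, safe_fillna -> fillna, safe_replace -> replace_column_values,
safe_convert_dtype -> convert_datetime, split_race -> normalize_race),
expanded the docstrings, and switched Race handling to collapse multi-value
entries into "Two or More Races".

A minimal sketch of the renamed chaining API, mirroring the notebook's own
example cell (column names come from the demo dataset):

```python
cleaner = DataCleaner(all_demo)

clean_df = (
    cleaner
    .drop_columns(["First Name", "Last Name"])       # missing columns are skipped
    .fillna({"Outcome": "Unknown"})                  # per-column NaN defaults
    .replace_column_values("Veteran", {"No": 0, "Yes": 1, "Unknown": -1})
    .convert_datetime("Auto Id", "datetime64[ns]")   # fails gracefully on bad input
    .normalize_gender()                              # unify transgender categories
    .normalize_race()                                # "White;Asian" -> "Two or More Races"
    .clean_salary()                                  # "$50k", ranges, hourly -> annual
    .finalize()                                      # returns the cleaned DataFrame
)
```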
--- src/notebooks/mainNb.ipynb | 572 +++++++++++++++++++++++++++++-------- 1 file changed, 458 insertions(+), 114 deletions(-) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index 6e8aca4..8e0cbfb 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -6,19 +6,21 @@ "metadata": {}, "source": [ "### **Table of Contents**\n", + "\n", " * [**Table of Contents**](#**table-of-contents**)\n", - " * [Function To Read in the Data!](#function-to-read-in-the-data!)\n", - " * [Example usage](#example-usage)\n", - " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", - " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", - " * [Update cleaning code](#update-cleaning-code)\n", - " * [Generate report](#generate-report)\n", - " * [Plots](#plots)" + "\n", + "- [Function To Read in the Data!](#function-to-read-in-the-data!)\n", + "- [Example usage](#example-usage)\n", + " - [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", + " - [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", + "- [Update cleaning code](#update-cleaning-code)\n", + "- [Generate report](#generate-report)\n", + "- [Plots](#plots)\n" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 2, "id": "d11a2343", "metadata": {}, "outputs": [], @@ -37,12 +39,12 @@ "id": "0764cac1", "metadata": {}, "source": [ - "## Function To Read in the Data! " + "## Function To Read in the Data!\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "7cd30f44", "metadata": {}, "outputs": [], @@ -98,7 +100,7 @@ "\n", " dataframes[p.stem] = df\n", "\n", - " return dataframes\n" + " return dataframes" ] }, { @@ -106,17 +108,20 @@ "id": "714769cf", "metadata": {}, "source": [ - "## Example usage \n", + "## Example usage\n", "\n", - "```python \n", + "```python\n", "dfs = load_data_folder()\n", "dfs.keys()\n", "```\n", + "\n", "output:\n", + "\n", "```bash\n", "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n", "```\n", - "#### To Access a DataFrame in the list \n", + "\n", + "#### To Access a DataFrame in the list\n", "\n", "```python\n", "all_demo = dfs['All_demographics_and_programs']\n", @@ -128,20 +133,18 @@ "|:--:|:--:|:--:|\n", "|3.14|name|apple|\n", "\n", - "\n", - "\n", "#### To Remove Spaces in DataFrame name\n", "\n", - "```python \n", + "```python\n", "for name, df in dfs.items():\n", " safe_name = name.replace(\" \", \"_\")\n", " globals()[safe_name] = df\n", - "```" + "```\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "id": "15c0e5af", "metadata": {}, "outputs": [ @@ -151,7 +154,7 @@ "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" ] }, - "execution_count": 16, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -166,12 +169,12 @@ "id": "f643e1d8", "metadata": {}, "source": [ - "How to call the dataframe from the list above" + "How to call the dataframe from the list above\n" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "id": "5875ef3e", "metadata": {}, "outputs": [ @@ -313,7 +316,7 @@ "4 NaN NaN Tech Louisville 21-22 " ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -328,12 +331,12 @@ "id": "7e00a727", "metadata": {}, "source": [ - "Little for loop at access the dataframes individually" + "Little for loop at access the dataframes 
individually\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "c3c755a4", "metadata": {}, "outputs": [], @@ -345,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "id": "fa63b693", "metadata": {}, "outputs": [ @@ -598,7 +601,7 @@ "[32230 rows x 12 columns]" ] }, - "execution_count": 21, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -612,13 +615,14 @@ "id": "fe6f5506", "metadata": {}, "source": [ - "## Update cleaning code \n", - "- Look at our cleaning code that we have. \n", - "- we should start to make changes to it to account for this. \n", - "- We need to make it so it so the program doesn't crash when something fails \n", + "## Update cleaning code\n", + "\n", + "- Look at our cleaning code that we have.\n", + "- we should start to make changes to it to account for this.\n", + "- We need to make it so it so the program doesn't crash when something fails\n", " - [Try Except logic updates](https://www.w3schools.com/python/python_try_except.asp)\n", " - make the messages mean something meaningful\n", - "- Ideally we will not drop anything from our data \n" + "- Ideally we will not drop anything from our data\n" ] }, { @@ -626,39 +630,100 @@ "id": "29302c63", "metadata": {}, "source": [ - "Will update this a bit with usage etc... " + "Will update this a bit with usage etc...\n" ] }, { "cell_type": "code", - "execution_count": 16, - + "execution_count": null, "id": "749ae60a", "metadata": {}, "outputs": [], "source": [ "class DataCleaner:\n", " \"\"\"\n", - " General-purpose cleaner for multiple WORC datasets\n", - " (Employment, Enrollments, Demographics).\n", + " A utility class for cleaning and standardizing tabular datasets.\n", + "\n", + " This class wraps a pandas DataFrame and provides a set of \n", + " convenience methods for common data cleaning tasks such as:\n", "\n", - " Uses try/except for safety (does not break if col missing).\n", - " Keeps all rows (no drops), but fills/fixes when possible.\n", + " - Dropping unnecessary columns.\n", + " - Filling missing values with specified defaults.\n", + " - Replacing or normalizing categorical values.\n", + " - Converting data types safely (including datetime).\n", + " - Standardizing demographic fields (e.g., gender, race).\n", + " - Parsing and normalizing salary values.\n", + "\n", + " All methods are designed to fail gracefully:\n", + " - If a target column does not exist, it is skipped.\n", + " - If an operation fails due to incompatible data, a warning \n", + " is printed and the DataFrame remains unchanged.\n", + "\n", + " Most methods return `self`, enabling method chaining:\n", + "\n", + " Example\n", + " -------\n", + " >>> cleaner = DataCleaner(df)\n", + " >>> clean_df = (\n", + " ... cleaner\n", + " ... .drop_columns([\"UnusedCol\"])\n", + " ... .fillna({\"Age\": 0, \"City\": \"Unknown\"})\n", + " ... .normalize_gender()\n", + " ... .normalize_race()\n", + " ... .clean_salary()\n", + " ... .finalize()\n", + " ... )\n", " \"\"\"\n", "\n", " def __init__(self, df: pd.DataFrame):\n", " self.df = df.copy()\n", "\n", - " def safe_drop_columns(self, cols_to_drop):\n", - " \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n", + " def drop_columns(self, cols_to_drop) -> \"Self\":\n", + " \"\"\"\n", + " Drop one or more columns from the DataFrame safely.\n", + "\n", + " This method attempts to drop the specified columns. If a column \n", + " does not exist, it is ignored (no error is raised). 
If dropping \n", + " fails due to another issue (e.g., invalid argument type), a \n", + " warning is printed and the DataFrame is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " cols_to_drop : str or list of str\n", + " Column name or list of column names to drop.\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n", " except Exception as e:\n", " print(f\"[Warning] Failed dropping columns: {e}\")\n", " return self\n", "\n", - " def safe_fillna(self, fill_map: dict):\n", - " \"\"\"Fill NaN values for specific columns safely.\"\"\"\n", + " def fillna(self, fill_map: dict) -> \"Self\":\n", + " \"\"\"\n", + " Fill missing (NaN) values in specified columns safely.\n", + "\n", + " For each column provided in the mapping, this method replaces \n", + " NaN values with the specified fill value. Columns not present \n", + " in the DataFrame are skipped. If filling fails for a column \n", + " (e.g., due to incompatible data types), a warning is printed \n", + " and that column is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " fill_map : dict\n", + " A dictionary mapping {column_name: fill_value} pairs.\n", + " Example: {\"age\": 0, \"city\": \"Unknown\"}\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " for col, val in fill_map.items():\n", " try:\n", " if col in self.df.columns:\n", @@ -667,8 +732,31 @@ " print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n", " return self\n", "\n", - " def safe_replace(self, col, replacements: dict):\n", - " \"\"\"Replace values in a column safely.\"\"\"\n", + " def replace_column_values(self, col: str, replacements: dict) -> \"Self\":\n", + " \"\"\"\n", + " Replace values in a specified DataFrame column using a mapping dictionary.\n", + "\n", + " This method attempts to apply the given replacements safely. \n", + " If the column exists, it replaces matching values based on the \n", + " provided mapping. 
If an error occurs during replacement \n", + " (e.g., invalid mapping or data type mismatch), a warning \n", + " is printed and the DataFrame is left unchanged.\n", + "\n", + " Parameters\n", + " ----------\n", + " col : str\n", + " The name of the column in the DataFrame to modify.\n", + " replacements : dict\n", + " A mapping of {old_value: new_value} pairs to replace.\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " Sample usage:\n", + " >>> cleaner = DataCleaner(df)\n", + " >>> cleaner.replace_column_values(\"status\", {\"yes\": 1, \"no\": 0})\n", + " \"\"\"\n", " try:\n", " if col in self.df.columns:\n", " self.df[col] = self.df[col].replace(replacements)\n", @@ -676,7 +764,7 @@ " print(f\"[Warning] Failed replacing values in {col}: {e}\")\n", " return self\n", "\n", - " def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n", + " def convert_datetime(self, col, dtype, errors=\"ignore\"):\n", " \"\"\"Convert column dtype safely.\"\"\"\n", " try:\n", " if col in self.df.columns:\n", @@ -689,8 +777,25 @@ " print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n", " return self\n", "\n", - " def normalize_gender(self):\n", - " \"\"\"Unify transgender categories safely.\"\"\"\n", + " def normalize_gender(self) -> \"Self\":\n", + " \"\"\"\n", + " Standardize gender labels in the DataFrame.\n", + "\n", + " This method looks for a column named \"Gender\" and replaces \n", + " specific transgender categories with the unified label \n", + " \"Transgender\". If the column does not exist or the replacement \n", + " fails (e.g., due to unexpected data types), the method prints a \n", + " warning and leaves the DataFrame unchanged.\n", + "\n", + " Replacements performed:\n", + " - \"Transgender male to female\" → \"Transgender\"\n", + " - \"Transgender female to male\" → \"Transgender\"\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " if \"Gender\" in self.df.columns:\n", " self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n", @@ -701,18 +806,35 @@ " print(f\"[Warning] Failed gender normalization: {e}\")\n", " return self\n", "\n", - " def split_race(self):\n", - " \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n", + " def normalize_race(self) -> \"Self\":\n", + " \"\"\"\n", + " Normalize the 'Race' column so that multi-value entries are \n", + " collapsed into a single category \"Two or More Races\".\n", + "\n", + " Behavior\n", + " --------\n", + " - Single race values are kept as-is.\n", + " - Multi-value entries separated by \";\" or \",\" are replaced with\n", + " \"Two or More Races\".\n", + "\n", + " Example\n", + " -------\n", + " Original: \"White;Asian\" → \"Two or More Races\"\n", + " Original: \"White,Asian\" → \"Two or More Races\"\n", + "\n", + " Returns\n", + " -------\n", + " Self\n", + " The current instance, allowing method chaining.\n", + " \"\"\"\n", " try:\n", " if \"Race\" in self.df.columns:\n", - " splitting = self.df[\"Race\"].astype(\n", - " str).str.split(\";\", expand=True)\n", - " splitting.columns = [\n", - " f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n", - " self.df = pd.concat(\n", - " [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n", + " self.df[\"Race\"] = self.df[\"Race\"].astype(str).apply(\n", + " lambda x: \"Two or More Races\" if (\n", + " \";\" in x or \",\" in x) else x\n", + " )\n", " except Exception as e:\n", - " print(f\"[Warning] Failed 
race splitting: {e}\")\n", + " print(f\"[Warning] Failed race normalization: {e}\")\n", " return self\n", "\n", " def clean_salary(self, hours_per_year: int = 2080):\n", @@ -720,13 +842,14 @@ " Clean and standardize salary values in the DataFrame.\n", "\n", " Steps performed:\n", - " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → \"50000\").\n", + " 1. Remove currency symbols, commas, and shorthand (e.g., \"$50k\" → 50000).\n", " 2. Handle ranges by converting them to the average value \n", - " (e.g., \"50,000-70,000\" → 60000).\n", - " 3. Convert values to numeric, coercing invalid entries to NaN.\n", - " 4. Treat values < 200 as hourly wages and convert to annual salaries \n", - " (multiplied by `hours_per_year`).\n", - " 5. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", + " (e.g., \"50,000–70,000\" → 60000).\n", + " 3. Handle shorthand \"M\" (e.g., \"$1.5M\" → 1,500,000).\n", + " 4. Convert values to numeric, coercing invalid entries to NaN.\n", + " 5. Treat values <= 200 as hourly wages and convert to annual salaries \n", + " (multiplied by `hours_per_year`).\n", + " 6. Drop unrealistic values greater than 1,000,000 (set to NaN).\n", "\n", " Parameters\n", " ----------\n", @@ -741,12 +864,18 @@ " try:\n", " if \"Salary\" in self.df.columns:\n", " self.df[\"Salary\"] = self.df[\"Salary\"].astype(str)\n", + "\n", " def parse_salary(val: str):\n", " val = val.strip()\n", + " if not val or val.lower() in {\"nan\", \"none\"}:\n", + " return None\n", + "\n", + " # Normalize dash types (hyphen, en dash, em dash \"-\")\n", + " val = re.sub(r\"[–—]\", \"-\", val)\n", "\n", - " # Handle range like \"50k-70k\" or \"50,000–70,000\"\n", - " if \"-\" in val or \"–\" in val:\n", - " parts = re.split(r\"[-–]\", val)\n", + " # Handle range like \"50k-70k\" or \"50,000-70,000\"\n", + " if \"-\" in val:\n", + " parts = val.split(\"-\")\n", " nums = [parse_salary(p) for p in parts if p.strip()]\n", " nums = [n for n in nums if n is not None]\n", " return sum(nums) / len(nums) if nums else None\n", @@ -754,12 +883,17 @@ " # Remove $, commas, spaces\n", " val = re.sub(r\"[\\$,]\", \"\", val)\n", "\n", - " # Handle shorthand k/K (e.g., 50k -> 50000)\n", - " match = re.match(r\"(\\d+(\\.\\d+)?)([kK])\", val)\n", - " if match:\n", - " return float(match.group(1)) * 1000\n", + " # Handle shorthand k/K (e.g., \"50k\" → 50000)\n", + " match_k = re.match(r\"^(\\d+(\\.\\d+)?)[kK]$\", val)\n", + " if match_k:\n", + " return float(match_k.group(1)) * 1000\n", + "\n", + " # Handle shorthand M (e.g., \"1.5M\" → 1500000)\n", + " match_m = re.match(r\"^(\\d+(\\.\\d+)?)[mM]$\", val)\n", + " if match_m:\n", + " return float(match_m.group(1)) * 1_000_000\n", "\n", - " # Convert plain number if possible\n", + " # Plain number (integer or float)\n", " try:\n", " return float(val)\n", " except ValueError:\n", @@ -769,7 +903,8 @@ " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", "\n", " # Convert small numbers (hourly) to annual\n", - " self.df.loc[self.df[\"Salary\"] < 200, \"Salary\"] *= hours_per_year\n", + " self.df.loc[self.df[\"Salary\"] <=\n", + " 200, \"Salary\"] *= hours_per_year\n", "\n", " # Drop unrealistic salaries\n", " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", @@ -779,23 +914,226 @@ "\n", " return self\n", "\n", - " def finalize(self):\n", - " \"\"\"Return cleaned dataframe.\"\"\"\n", + " def finalize(self) -> pd.DataFrame:\n", + " \"\"\"\n", + " Finalize and return the cleaned DataFrame.\n", + "\n", + " This method should be 
called at the end of a cleaning pipeline \n", + " to retrieve the fully processed DataFrame after all applied \n", + " transformations.\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " The cleaned and transformed DataFrame.\n", + " \"\"\"\n", " return self.df" ] }, { "cell_type": "markdown", - + "id": "d84a5b95", + "metadata": {}, + "source": [ + "# example usage of each method\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "329da719", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/2d/yt4_w6zn5pbfjg_jx5sdmm180000gn/T/ipykernel_89024/677290603.py:130: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " self.df[col] = pd.to_datetime(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Auto IdGenderRaceEthnicity Hispanic/LatinoOutcomeVeteranEx-OffenderJustice InvolvedSingle ParentProgram: Program Name
0NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
1NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
2NaTMaleBlack or African AmericanUnknownUnknown0UnknownUnknownUnknownReimage 21-22
3NaTMaleAsianUnknownSuccessfully Completed0UnknownNoUnknownTech Louisville 21-22
4NaTMaleBlack or African AmericanUnknownUnknown-1UnknownUnknownUnknownTech Louisville 21-22
\n", + "
" + ], + "text/plain": [ + " Auto Id Gender Race Ethnicity Hispanic/Latino \\\n", + "0 NaT Male Black or African American Unknown \n", + "1 NaT Male Black or African American Unknown \n", + "2 NaT Male Black or African American Unknown \n", + "3 NaT Male Asian Unknown \n", + "4 NaT Male Black or African American Unknown \n", + "\n", + " Outcome Veteran Ex-Offender Justice Involved Single Parent \\\n", + "0 Unknown 0 Unknown Unknown Unknown \n", + "1 Unknown 0 Unknown Unknown Unknown \n", + "2 Unknown 0 Unknown Unknown Unknown \n", + "3 Successfully Completed 0 Unknown No Unknown \n", + "4 Unknown -1 Unknown Unknown Unknown \n", + "\n", + " Program: Program Name \n", + "0 Reimage 21-22 \n", + "1 Reimage 21-22 \n", + "2 Reimage 21-22 \n", + "3 Tech Louisville 21-22 \n", + "4 Tech Louisville 21-22 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cleaner = DataCleaner(all_demo)\n", + "\n", + "clean_df = (\n", + " cleaner\n", + " # 1. Drop unneeded columns\n", + " .drop_columns([\"First Name\", \"Last Name\"])\n", + "\n", + " # 2. Fill missing values\n", + " .fillna({\n", + " \"Outcome\": \"Unknown\",\n", + " \"Veteran\": \"Unknown\",\n", + " \"Ex-Offender\": \"Unknown\",\n", + " \"Justice Involved\": \"Unknown\",\n", + " \"Single Parent\": \"Unknown\",\n", + " \"Ethnicity Hispanic/Latino\": \"Unknown\"\n", + " })\n", + "\n", + " # 3. Replace specific column values\n", + " .replace_column_values(\"Veteran\", {\"No\": 0, \"Yes\": 1, \"Unknown\": -1})\n", + "\n", + " # 4. Convert a column to datetime (pretend Auto Id is a date code)\n", + " .convert_datetime(\"Auto Id\", \"datetime64[ns]\") # will fail gracefully\n", + "\n", + " # 5. Normalize gender labels\n", + " .normalize_gender()\n", + "\n", + " # 6. Normalize race column (collapse multi-value)\n", + " .normalize_race()\n", + "\n", + " # 7. Clean salary column\n", + " .clean_salary()\n", + "\n", + " # 8. Finalize and return cleaned DataFrame\n", + " .finalize()\n", + ")\n", + "\n", + "clean_df.head()" + ] + }, + { + "cell_type": "markdown", "id": "3eb6373f", "metadata": {}, "source": [ - "### Sample use of the clean_salary function. 
" + "### Sample use of the clean_salary function.\n" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "id": "182eac4a", "metadata": {}, "outputs": [ @@ -811,7 +1149,7 @@ "4 75000.0\n", "5 100000.0\n", "6 150000.0\n", - "7 200.0\n", + "7 416000.0\n", "8 3000.0\n", "9 NaN\n", "10 NaN\n", @@ -838,7 +1176,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "id": "82806fc9", "metadata": {}, "outputs": [ @@ -856,7 +1194,7 @@ "6 NaN\n", "7 NaN\n", "8 NaN\n", - "9 NaN\n" + "9 75000.0\n" ] } ], @@ -961,7 +1299,8 @@ " self.df[\"Salary\"] = self.df[\"Salary\"].apply(parse_salary)\n", "\n", " # Convert small numbers (hourly) to annual\n", - " self.df.loc[self.df[\"Salary\"] <= 200, \"Salary\"] *= hours_per_year\n", + " self.df.loc[self.df[\"Salary\"] <=\n", + " 200, \"Salary\"] *= hours_per_year\n", "\n", " # Drop unrealistic salaries\n", " self.df.loc[self.df[\"Salary\"] > 1_000_000, \"Salary\"] = None\n", @@ -973,12 +1312,12 @@ "\n", " def finalize(self):\n", " \"\"\"Return cleaned dataframe.\"\"\"\n", - " return self.df\n" + " return self.df" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "688bdf74", "metadata": {}, "outputs": [ @@ -994,16 +1333,16 @@ "# Test DataFrame with edge/fail cases\n", "fail_df = pd.DataFrame({\n", " \"Salary\": [\n", - " None, # NaN\n", - " \"\", # NaN\n", - " \" \", # NaN\n", - " \"abc123\", # NaN\n", - " \"50k-abc\", # 50000.0\n", - " \"$-5000\", # -5000.0 (still allowed for now)\n", - " \"∞\", # NaN\n", - " \"NaN\", # NaN\n", - " \"$1.5M\", # NaN ( >1,000,000 rule)\n", - " \"70,000—80,000\" # 75000.0 (dash normalized)\n", + " None, # NaN\n", + " \"\", # NaN\n", + " \" \", # NaN\n", + " \"abc123\", # NaN\n", + " \"50k-abc\", # 50000.0\n", + " \"$-5000\", # -5000.0 (still allowed for now)\n", + " \"∞\", # NaN\n", + " \"NaN\", # NaN\n", + " \"$1.5M\", # NaN ( >1,000,000 rule)\n", + " \"70,000—80,000\" # 75000.0 (dash normalized)\n", " ]\n", "})\n", "\n", @@ -1037,15 +1376,15 @@ "id": "6ddbb4c0", "metadata": {}, "source": [ - "## Generate report \n", + "## Generate report\n", "\n", "- Overall completion of program only accounting for the new style of classes m1-m4\n", - "- completion by year \n", - "- completion over all by pathway \n", - "- completion by year by pathway \n", - "- Feel free to get creative here adding gender etc to get us a better understanding \n", - "- education level and the above... 
\n", - "- export this as a txt file " + "- completion by year\n", + "- completion over all by pathway\n", + "- completion by year by pathway\n", + "- Feel free to get creative here adding gender etc to get us a better understanding\n", + "- education level and the above...\n", + "- export this as a txt file\n" ] }, { @@ -1061,15 +1400,16 @@ "id": "859cf674", "metadata": {}, "source": [ - "## Plots \n", - "- Look at the various plots \n", + "## Plots\n", + "\n", + "- Look at the various plots\n", "- make a consistent color scheme\n", - "- pick the plots that go with the report above \n", - "- make missing plots \n", + "- pick the plots that go with the report above\n", + "- make missing plots\n", "- make plots have the option to show & save in the functions\n", "\n", "see `src/notebooks/visualization_examples.ipynb`\n", - "See below from `src/Carmen_WORCEmployment_Plots.py`" + "See below from `src/Carmen_WORCEmployment_Plots.py`\n" ] }, { @@ -1088,13 +1428,15 @@ "\n", "def plot_avg_salary_by_city(data):\n", " region_salary = data.groupby('Mailing City')['Salary'].mean().sort_values()\n", - " region_salary.plot(kind='barh', figsize=(8, 5), title=\"Average Salary by KY Region\")\n", + " region_salary.plot(kind='barh', figsize=(\n", + " 8, 5), title=\"Average Salary by KY Region\")\n", " plt.xlabel(\"Average Salary\")\n", " plt.show()\n", "\n", "\n", "def plot_placements_over_time(data):\n", - " data.set_index('Start Date').resample('M').size().plot(kind='line', marker='o', figsize=(10, 4))\n", + " data.set_index('Start Date').resample('M').size().plot(\n", + " kind='line', marker='o', figsize=(10, 4))\n", " plt.title(\"Number of Placements Over Time\")\n", " plt.ylabel(\"Placements\")\n", " plt.show()\n", @@ -1102,7 +1444,8 @@ "\n", "def plot_placement_type_by_program(data):\n", " plt.figure(figsize=(10, 6))\n", - " sns.countplot(data=data, x='ATP Placement Type', hue='Program: Program Name')\n", + " sns.countplot(data=data, x='ATP Placement Type',\n", + " hue='Program: Program Name')\n", " plt.xticks(rotation=45)\n", " plt.title(\"Placement Type by Program\")\n", " plt.show()\n", @@ -1110,7 +1453,8 @@ "\n", "def plot_top_cities(data):\n", " city_counts = data['Mailing City'].value_counts().head(10)\n", - " city_counts.plot(kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", + " city_counts.plot(\n", + " kind='bar', title='Top Cities by Participant Count', figsize=(8, 4))\n", " plt.ylabel(\"Count\")\n", " plt.show()" ] @@ -1120,12 +1464,12 @@ "id": "f905708f", "metadata": {}, "source": [ - "TOC generator " + "TOC generator\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d4fc7116", "metadata": {}, "outputs": [ @@ -1182,13 +1526,13 @@ "\n", "\n", "notebook_path = 'mainNb.ipynb'\n", - "generate_toc_from_notebook(notebook_path)\n" + "generate_toc_from_notebook(notebook_path)" ] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "venv (3.12.2)", "language": "python", "name": "python3" }, @@ -1202,7 +1546,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.1" + "version": "3.12.2" } }, "nbformat": 4, From 27ec9463f45bfb47558428bbee36c6ab355abc2f Mon Sep 17 00:00:00 2001 From: dmorton714 Date: Thu, 21 Aug 2025 10:43:24 -0400 Subject: [PATCH 3/3] added doc strings --- src/notebooks/mainNb.ipynb | 109 +++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb index 
8e0cbfb..3d7669e 100644 --- a/src/notebooks/mainNb.ipynb +++ b/src/notebooks/mainNb.ipynb @@ -7,8 +7,6 @@ "source": [ "### **Table of Contents**\n", "\n", - " * [**Table of Contents**](#**table-of-contents**)\n", - "\n", "- [Function To Read in the Data!](#function-to-read-in-the-data!)\n", "- [Example usage](#example-usage)\n", " - [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", @@ -44,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "7cd30f44", "metadata": {}, "outputs": [], @@ -610,6 +608,76 @@ "All_demographics_and_programs" ] }, + { + "cell_type": "markdown", + "id": "b83d9a39", + "metadata": {}, + "source": [ + "Should we switch to this rather than the 2 step process above?\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a009686", + "metadata": {}, + "outputs": [], + "source": [ + "def load_data_folder(\n", + " folder_path: Union[str, os.PathLike] = \"../../data\",\n", + " safe_names: bool = False\n", + ") -> Dict[str, pd.DataFrame]:\n", + " \"\"\"\n", + " Load all CSV/XLS/XLSX files in a folder into pandas DataFrames.\n", + " ...\n", + " safe_names : bool, optional\n", + " If True, replace spaces in filenames with underscores for dict keys.\n", + " \"\"\"\n", + " path = Path(folder_path)\n", + " if not path.exists():\n", + " raise FileNotFoundError(f\"Folder not found: {path.resolve()}\")\n", + "\n", + " dataframes: Dict[str, pd.DataFrame] = {}\n", + " for p in path.iterdir():\n", + " if not p.is_file():\n", + " continue\n", + "\n", + " ext = p.suffix.lower()\n", + " if ext == \".csv\":\n", + " df = pd.read_csv(p)\n", + " elif ext in {\".xlsx\", \".xls\"}:\n", + " df = pd.read_excel(p)\n", + " else:\n", + " continue\n", + "\n", + " key = p.stem.replace(\" \", \"_\") if safe_names else p.stem\n", + " dataframes[key] = df\n", + "\n", + " return dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce7ffc41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfs = load_data_folder(safe_names=True)\n", + "dfs.keys()" + ] + }, { "cell_type": "markdown", "id": "fe6f5506", @@ -765,7 +833,40 @@ " return self\n", "\n", " def convert_datetime(self, col, dtype, errors=\"ignore\"):\n", - " \"\"\"Convert column dtype safely.\"\"\"\n", + " \"\"\"\n", + " Convert a column to a specified dtype, with special handling for datetimes.\n", + "\n", + " Parameters\n", + " ----------\n", + " col : str\n", + " Name of the column to convert.\n", + " dtype : str or type\n", + " Target dtype. If the string contains \"datetime\", the method will use\n", + " `pandas.to_datetime` for conversion. 
Otherwise, it uses `.astype()`.\n",
+        "    errors : {\"ignore\", \"raise\", \"coerce\"}, default \"ignore\"\n",
+        "        Error handling behavior:\n",
+        "        - \"ignore\": invalid parsing will return the original input.\n",
+        "        - \"raise\": raises an exception on invalid parsing.\n",
+        "        - \"coerce\": invalid datetime parsing will be set as NaT\n",
+        "          (datetime conversions only; `.astype()` does not accept \"coerce\").\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    self : DataCleaner\n",
+        "        The instance with the modified DataFrame, allowing for method chaining.\n",
+        "\n",
+        "    Notes\n",
+        "    -----\n",
+        "    - For datetime conversion, the method forces `errors=\"coerce\"` to ensure\n",
+        "      invalid values are converted to NaT instead of raising.\n",
+        "    - For non-datetime conversions, the provided `errors` argument is passed\n",
+        "      directly to `.astype()`.\n",
+        "    - If the column does not exist, no action is taken.\n",
+        "\n",
+        "    Examples\n",
+        "    --------\n",
+        "    >>> cleaner.convert_datetime(\"StartDate\", \"datetime64[ns]\")\n",
+        "    >>> cleaner.convert_datetime(\"Age\", \"int\", errors=\"raise\")\n",
+        "    \"\"\"\n",
         "        try:\n",
         "            if col in self.df.columns:\n",
         "                if \"datetime\" in str(dtype):\n",