From af78425b668a2414ddf41219f1f49694ed1ffa04 Mon Sep 17 00:00:00 2001 From: "John R. McGarvey" Date: Thu, 20 Mar 2025 22:46:59 -0400 Subject: [PATCH 1/9] create folders for assignment 9 and 10 homework --- assignment10/.keep | 0 assignment9/.keep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 assignment10/.keep create mode 100644 assignment9/.keep diff --git a/assignment10/.keep b/assignment10/.keep new file mode 100644 index 0000000..e69de29 diff --git a/assignment9/.keep b/assignment9/.keep new file mode 100644 index 0000000..e69de29 From f00a64f7a8f6912c59577b920da57a005dcd1e10 Mon Sep 17 00:00:00 2001 From: "John R. McGarvey" Date: Thu, 20 Mar 2025 23:20:49 -0400 Subject: [PATCH 2/9] added folders for assignments 7 and 8 as well --- assignment7/.keep | 0 assignment8/.keep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 assignment7/.keep create mode 100644 assignment8/.keep diff --git a/assignment7/.keep b/assignment7/.keep new file mode 100644 index 0000000..e69de29 diff --git a/assignment8/.keep b/assignment8/.keep new file mode 100644 index 0000000..e69de29 From 6583442e13d59fb41918c5b9719830ddd9c2c6cc Mon Sep 17 00:00:00 2001 From: Tom Arns Date: Fri, 21 Mar 2025 08:45:17 -0700 Subject: [PATCH 3/9] fixed test to match problem description - case error --- assignment3/assignment3-test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment3/assignment3-test.py b/assignment3/assignment3-test.py index d1ac1af..b0a14ca 100644 --- a/assignment3/assignment3-test.py +++ b/assignment3/assignment3-test.py @@ -3,7 +3,7 @@ import pandas as pd import os -test1_df = pd.DataFrame({ 'Name': ['Alice', 'Bob', 'charlie'], +test1_df = pd.DataFrame({ 'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35], 'City': ['New York', 'Los Angeles', 'Chicago']}) From 1a29712e95f467e127cac9683359682a0c0e0ac4 Mon Sep 17 00:00:00 2001 From: "John R. McGarvey" Date: Mon, 31 Mar 2025 00:14:05 -0400 Subject: [PATCH 4/9] add the assignment12 folder --- assignment12/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 assignment12/.keep diff --git a/assignment12/.keep b/assignment12/.keep new file mode 100644 index 0000000..e69de29 From 197ea81b7c2ef08e721b476bb7b7e8e7882876c9 Mon Sep 17 00:00:00 2001 From: Rick Martin Date: Wed, 2 Apr 2025 09:48:21 -0400 Subject: [PATCH 5/9] Added test for NaTs in Hire Date field --- assignment3/assignment3-test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/assignment3/assignment3-test.py b/assignment3/assignment3-test.py index b0a14ca..a0576a0 100644 --- a/assignment3/assignment3-test.py +++ b/assignment3/assignment3-test.py @@ -104,9 +104,10 @@ def test_department_uppercase(): all_upper = False assert all_upper - - - +# If dates are not converted properly with form="mixed" will end up with NaTs +def test_hire_date_notNAT(): + nat_count = a3.clean_data['Hire Date'].isna().sum() + assert nat_count == 0 From a938922cda81e2222faa3dfebd7ffc86d1eee0dc Mon Sep 17 00:00:00 2001 From: Rick Martin Date: Wed, 2 Apr 2025 09:54:41 -0400 Subject: [PATCH 6/9] Added test for NaTs in Hire Date field again --- assignment3/assignment3-test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment3/assignment3-test.py b/assignment3/assignment3-test.py index a0576a0..5843753 100644 --- a/assignment3/assignment3-test.py +++ b/assignment3/assignment3-test.py @@ -104,7 +104,7 @@ def test_department_uppercase(): all_upper = False assert all_upper -# If dates are not converted properly with form="mixed" will end up with NaTs +# April 2: If dates are not converted properly with form="mixed" will end up with NaTs def test_hire_date_notNAT(): nat_count = a3.clean_data['Hire Date'].isna().sum() assert nat_count == 0 From 35b7faa6c2b88c99caa716b15d1f5b954a8460f1 Mon Sep 17 00:00:00 2001 From: Hicham Lamnaouar Date: Sun, 6 Apr 2025 09:22:00 -0500 Subject: [PATCH 7/9] Saving changes in assignments directory --- assignment2/diary.py | 199 ++++++++++++++++++++++++++++++++++++++++++ assignment2/diary.txt | 45 ++++++++++ 2 files changed, 244 insertions(+) create mode 100644 assignment2/diary.py create mode 100644 assignment2/diary.txt diff --git a/assignment2/diary.py b/assignment2/diary.py new file mode 100644 index 0000000..998b536 --- /dev/null +++ b/assignment2/diary.py @@ -0,0 +1,199 @@ +# Task 1: Diary +import traceback + +try: + with open("diary.txt", "a") as file: # Open file in append mode + first_entry = True # Flag to check if it's the first input + + while True: + # If it is the first input, prompt for "What happened today?" + if first_entry: + entry = input("What happened today? ") + first_entry = False # Change the flag to False after the first input + else: + entry = input("What else? ") + + # Check if the user enters 'done for now' + if entry.lower() == "done for now": + file.write(entry + "\n") # Write "done for now" to the file + break # Exit the loop after writing "done for now" + + # Write the user's entry to the file if it's not 'done for now' + file.write(entry + "\n") + +except Exception as e: + # Handle exceptions and print the traceback + trace_back = traceback.extract_tb(e.__traceback__) + stack_trace = [] + for trace in trace_back: + stack_trace.append(f'File : {trace[0]} , Line : {trace[1]}, Func.Name : {trace[2]}, Message : {trace[3]}') + print(f"Exception type: {type(e).__name__}") + message = str(e) + if message: + print(f"Exception message: {message}") + print(f"Stack trace: {stack_trace}") + + + +#Task 2: Read a CSV File +import csv +def read_employees(): + employees = {} # To store data + rows = [] # To store employee rows + try: + with open('../csv/employees.csv', mode='r') as file: + csv_reader = csv.reader(file) + fields = next(csv_reader) # Get the first row for field names + employees['fields'] = fields + for row in csv_reader: + rows.append(row) + employees['rows'] = rows + except Exception as e: + print("An exception occurred: ", e) + return employees + +# Test +employees = read_employees() +print(employees) + +#Task 3: Find the Column Index +def column_index(field_name): + return employees["fields"].index(field_name) + +# Test +employee_id_column = column_index("employee_id") +print(employee_id_column) + +#Task 4: Find the Employee First Name +def first_name(row_num): + column_idx = column_index("first_name") + return employees["rows"][row_num][column_idx] + +# Test +print(first_name(0)) # Get first name of the first employee + +#Task 5: Find the Employee:a Function in a Function +def employee_find(employee_id): + def employee_match(row): + return int(row[employee_id_column]) == employee_id + + matches = list(filter(employee_match, employees["rows"])) + return matches + +# Test +print(employee_find(1001)) + +#Task 6: Find the Employee with a Lambda +def employee_find_2(employee_id): + matches = list(filter(lambda row: int(row[employee_id_column]) == employee_id, employees["rows"])) + return matches + +# Test +print(employee_find_2(1001)) + +#Task 7: Sort the Rows by last_name Using a Lambda +def sort_by_last_name(): + last_name_column = column_index("last_name") + employees["rows"].sort(key=lambda row: row[last_name_column]) + return employees["rows"] + +# Test +sorted_employees = sort_by_last_name() +print(sorted_employees) + +#Task 8: Create a dict for an Employee +def employee_dict(row): + return {employees["fields"][i]: row[i] for i in range(len(row)) if employees["fields"][i] != "employee_id"} + +# Test +print(employee_dict(employees["rows"][0])) # Test for the first row + +#Task 9: A dict of dicts, for All Employees +def all_employees_dict(): + return {row[employee_id_column]: employee_dict(row) for row in employees["rows"]} + +# Test +print(all_employees_dict()) + +#Task 10: Use the os Module +# custom_module.py +secret = "shazam!" +def set_secret(new_secret): + global secret + secret = new_secret + +#Task 11: Creating Your Own Module + #1. Create custom_module.py: + # custom_module.py +secret = "shazam!" + +def set_secret(new_secret): + global secret + secret = new_secret + #2. In your main program: +import custom_module + +def set_that_secret(new_secret): + custom_module.set_secret(new_secret) + +# Test +set_that_secret("new_secret_value") +print(custom_module.secret) + +#Task 12: Read minutes1.csv and minutes2.csv +def read_minutes(): + def read_file(file_name): + minutes = {"fields": [], "rows": []} + try: + with open(file_name, mode='r') as file: + csv_reader = csv.reader(file) + minutes["fields"] = next(csv_reader) + for row in csv_reader: + minutes["rows"].append(tuple(row)) # Convert rows to tuple + except Exception as e: + print("An exception occurred: ", e) + return minutes + + minutes1 = read_file("../csv/minutes1.csv") + minutes2 = read_file("../csv/minutes2.csv") + return minutes1, minutes2 + +# Test +minutes1, minutes2 = read_minutes() +print(minutes1) +print(minutes2) + +#Task 13: Create minutes_set +def create_minutes_set(): + minutes1_set = set(minutes1["rows"]) + minutes2_set = set(minutes2["rows"]) + return minutes1_set.union(minutes2_set) + +# Test +minutes_set = create_minutes_set() +print(minutes_set) + +#Task 14: Convert to datetime +from datetime import datetime + +def create_minutes_list(): + minutes_list = list(minutes_set) + return list(map(lambda x: (x[0], datetime.strptime(x[1], "%B %d, %Y")), minutes_list)) + +# Test +minutes_list = create_minutes_list() +print(minutes_list) + +#Task 15: Write Out Sorted List +def write_sorted_list(): + minutes_list.sort(key=lambda x: x[1]) # Sort by datetime + with open("./minutes.csv", mode='w', newline='') as file: + csv_writer = csv.writer(file) + csv_writer.writerow(minutes1["fields"]) + for row in minutes_list: + csv_writer.writerow([row[0], row[1].strftime("%B %d, %Y")]) + return minutes_list + +# Test +sorted_minutes = write_sorted_list() +print(sorted_minutes) diff --git a/assignment2/diary.txt b/assignment2/diary.txt new file mode 100644 index 0000000..8066252 --- /dev/null +++ b/assignment2/diary.txt @@ -0,0 +1,45 @@ + + + + + + +done for now + +done for now +done for now +done for now +done for now +exit +done for now +done for now +done for now +"done for now" + + + + + + + + +done for now + +I went to school +meet friends +done for now +done for now +done for now +done for now +I went to dinner +meet some friends +i went to gym +done for now +done for now +done for now +went to gym +meet some friends +went to dinner +done for now +done for now +done for now From 4e3c6a80f4c869ab46a318cdb2344138e6b1bbd0 Mon Sep 17 00:00:00 2001 From: Hicham Lamnaouar Date: Sun, 6 Apr 2025 09:29:01 -0500 Subject: [PATCH 8/9] Saving changes 2 in assignments directory --- assignment10/.keep | 0 assignment12/.keep | 0 assignment7/.keep | 0 assignment8/.keep | 0 assignment9/.keep | 0 5 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 assignment10/.keep delete mode 100644 assignment12/.keep delete mode 100644 assignment7/.keep delete mode 100644 assignment8/.keep delete mode 100644 assignment9/.keep diff --git a/assignment10/.keep b/assignment10/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/assignment12/.keep b/assignment12/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/assignment7/.keep b/assignment7/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/assignment8/.keep b/assignment8/.keep deleted file mode 100644 index e69de29..0000000 diff --git a/assignment9/.keep b/assignment9/.keep deleted file mode 100644 index e69de29..0000000 From 6c5bf5cbaa64198f710b848d6266b9349d9bdb9b Mon Sep 17 00:00:00 2001 From: Hicham Lamnaouar Date: Sun, 6 Apr 2025 15:39:34 -0500 Subject: [PATCH 9/9] =?UTF-8?q?=20Lesson3=20-=20Lesson=203=20Assignment=20?= =?UTF-8?q?=E2=80=94=20Intro=20to=20Data=20Engineering?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- assignment3/additional_employees.json | 4 ++ assignment3/assignment3.py | 76 +++++++++++++++++++++++++++ assignment3/employees.csv | 4 ++ 3 files changed, 84 insertions(+) create mode 100644 assignment3/additional_employees.json create mode 100644 assignment3/employees.csv diff --git a/assignment3/additional_employees.json b/assignment3/additional_employees.json new file mode 100644 index 0000000..872c025 --- /dev/null +++ b/assignment3/additional_employees.json @@ -0,0 +1,4 @@ +[ + {"Name": "Eve", "Age": 28, "City": "Miami", "Salary": 60000}, + {"Name": "Frank", "Age": 40, "City": "Seattle", "Salary": 95000} +] \ No newline at end of file diff --git a/assignment3/assignment3.py b/assignment3/assignment3.py index e69de29..c760515 100644 --- a/assignment3/assignment3.py +++ b/assignment3/assignment3.py @@ -0,0 +1,76 @@ +# Task 1: Introduction to Pandas - Creating and Manipulating DataFrames + #1. Create a DataFrame from a dictionary: +import pandas as pd + +# Create dictionary +data = { + 'Name': ['Alice', 'Bob', 'Charlie'], + 'Age': [25, 30, 35], + 'City': ['New York', 'Los Angeles', 'Chicago'] +} +# Convert to DataFrame +task1_data_frame = pd.DataFrame(data) +print(task1_data_frame) + #2. Add a new column: +task1_with_salary = task1_data_frame.copy() +task1_with_salary['Salary'] = [70000, 80000, 90000] +print(task1_with_salary) + #3. Modify an existing column: +task1_older = task1_with_salary.copy() +task1_older['Age'] = task1_older['Age'] + 1 +print(task1_older) + #4. Save the DataFrame as a CSV file: +task1_older.to_csv('employees.csv', index=False) + +#Task 2: Loading Data from CSV and JSON + #1. Read data from a CSV file: +task2_employees = pd.read_csv('employees.csv') +print(task2_employees) + #2. Read data from a JSON file: +json_employees = pd.read_json('additional_employees.json') +print(json_employees) +more_employees = pd.concat([task2_employees, json_employees], ignore_index=True) +print(more_employees) + +#Task 3: Data Inspection - Using Head, Tail, and Info Methods + #Using the head() method: +first_three = more_employees.head(3) +print(first_three) + # Using the tail() method: +last_two = more_employees.tail(2) +print(last_two) + #Get the shape of a DataFrame +employee_shape = more_employees.shape +print(employee_shape) + #Use the info() method: +more_employees.info() + +# Task 4: Data Cleaning + # Create a DataFrame from dirty_data.csv file and assign it to the variable dirty_data. +dirty_data = pd.read_csv('dirty_data.csv') +print(dirty_data) +clean_data = dirty_data.copy() + #Remove any duplicate rows from the DataFrame +clean_data = clean_data.drop_duplicates() +print(clean_data) + # Convert Age to numeric and handle missing values +clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce') +print(clean_data) + # Convert Salary to numeric and replace known placeholders (unknown, n/a) with NaN +clean_data['Salary'] = clean_data['Salary'].replace(['unknown', 'n/a'], pd.NA) +clean_data['Salary'] = pd.to_numeric(clean_data['Salary'], errors='coerce') +print(clean_data) + # Fill missing numeric values (use fillna). Fill Age which the mean and Salary with the median +mean_age = clean_data['Age'].mean() +median_salary = clean_data['Salary'].median() + +clean_data['Age'] = clean_data['Age'].fillna(mean_age) +clean_data['Salary'] = clean_data['Salary'].fillna(median_salary) +print(clean_data) + #Convert Hire Date to datetime +clean_data['Hire Date'] = pd.to_datetime(clean_data['Hire Date'], errors='coerce') +print(clean_data) + # Strip extra whitespace and standardize Name and Department as uppercase +clean_data['Name'] = clean_data['Name'].str.strip().str.upper() +clean_data['Department'] = clean_data['Department'].str.strip().str.upper() +print(clean_data) \ No newline at end of file diff --git a/assignment3/employees.csv b/assignment3/employees.csv new file mode 100644 index 0000000..2bd2f60 --- /dev/null +++ b/assignment3/employees.csv @@ -0,0 +1,4 @@ +Name,Age,City,Salary +Alice,26,New York,70000 +Bob,31,Los Angeles,80000 +Charlie,36,Chicago,90000