diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..cb393e1 Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1214d9a Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..5d435d5 Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..eb4e2da 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,5 +1,8 @@ +# %load q01_missing_value/build.py # Default imports import pandas as pd +import numpy as np +from sklearn.preprocessing import Imputer # Data loading ny_housing = pd.read_csv('data/train.csv') @@ -7,4 +10,31 @@ housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] -# Write your code here: +# This function will impute numercal and categorical data +def imputation(housing_data): + ''' Function to impute numerical data with mean + impute categorical data with mode ''' + #Seperate numercial and categorical columns + num_col = housing_data._get_numeric_data().columns + cat_col = housing_data.columns.difference(num_col) + + #impute numerical data + imput_num = Imputer(missing_values=np.NaN , strategy= 'mean') + df_num = imput_num.fit_transform(housing_data.loc[: , num_col]) + df_num = pd.DataFrame(df_num) + df_num.columns = num_col + + #impute categorial data + for col in cat_col: + housing_data.loc[: , col].fillna(housing_data.loc[:,col].mode()[0] , inplace = True) + df_cat = housing_data.loc[:,cat_col] + + return df_num,df_cat + +imputation(housing_data) + + + + + + diff --git a/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..c22e846 Binary files /dev/null and b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc new file mode 100644 index 0000000..cc0d59c Binary files /dev/null and b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..c0fa46b Binary files /dev/null and b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/build.cpython-36.pyc b/q02_outlier_removal/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..6d64e44 Binary files /dev/null and b/q02_outlier_removal/__pycache__/build.cpython-36.pyc differ diff --git a/q02_outlier_removal/build.py b/q02_outlier_removal/build.py index 74df5f2..2ed2651 100644 --- a/q02_outlier_removal/build.py +++ b/q02_outlier_removal/build.py @@ -1,5 +1,12 @@ +# %load q02_outlier_removal/build.py # Default imports import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats +import warnings +warnings.filterwarnings('ignore') # Data ny_housing = pd.read_csv('data/train.csv') @@ -8,3 +15,13 @@ # Write your code here: +def outlier_removal(housing_data): + num_col = housing_data._get_numeric_data().columns + for col in num_col: + housing_data = housing_data.drop(housing_data[housing_data[col] > housing_data[col].quantile(0.95)].index) + return housing_data + +outlier_removal(housing_data) + + + diff --git a/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..4146151 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc new file mode 100644 index 0000000..b8374f6 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..3964ffd Binary files /dev/null and b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/build.cpython-36.pyc b/q03_skewness_log/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..9212a31 Binary files /dev/null and b/q03_skewness_log/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_log/build.py b/q03_skewness_log/build.py index f008d0f..e70e487 100644 --- a/q03_skewness_log/build.py +++ b/q03_skewness_log/build.py @@ -1,3 +1,4 @@ +# %load q03_skewness_log/build.py from scipy.stats import skew import pandas as pd import numpy as np @@ -5,4 +6,11 @@ data = pd.read_csv('data/train.csv') -# Write code here: +# Calculate skewness +def skewness_log(data): + return skew(np.log(data['GrLivArea'])), skew(np.log(data['SalePrice'])) + +skewness_log(data) + + + diff --git a/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..ea507ec Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc new file mode 100644 index 0000000..9acb718 Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..4e2f0dc Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..4e93905 Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/build.py b/q03_skewness_sqrt/build.py index 4bdb0e4..0c5d6de 100644 --- a/q03_skewness_sqrt/build.py +++ b/q03_skewness_sqrt/build.py @@ -1,3 +1,4 @@ +# %load q03_skewness_sqrt/build.py # Default imports from scipy.stats import skew import pandas as pd @@ -6,5 +7,16 @@ ny_housing = pd.read_csv('data/train.csv') -# Write your Solution Here: +# function to reduce skweness of variables +def skewness_sqrt(ny_housing): + skewed_sqrt_val2 = skew(np.sqrt(ny_housing['SalePrice'])) + skewed_sqrt_val1 = skew(np.sqrt(ny_housing['GrLivArea'])) + return skewed_sqrt_val1,skewed_sqrt_val2 + +skewness_sqrt(ny_housing) + + + + + diff --git a/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e7d92e2 Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc new file mode 100644 index 0000000..ecf93e0 Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/__init__.cpython-36.pyc b/q04_encoding/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5aab52f Binary files /dev/null and b/q04_encoding/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/build.cpython-36.pyc b/q04_encoding/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..a35cea8 Binary files /dev/null and b/q04_encoding/__pycache__/build.cpython-36.pyc differ diff --git a/q04_encoding/build.py b/q04_encoding/build.py index a52c57f..fd22c20 100644 --- a/q04_encoding/build.py +++ b/q04_encoding/build.py @@ -1,10 +1,23 @@ +# %load q04_encoding/build.py # Default imports import pandas as pd from sklearn.preprocessing import LabelEncoder ny_housing = pd.read_csv('data/train.csv') -housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] # Write your code here: +def encoding(ny_housing): + housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] + ny_housing['GarageType'].fillna('Not Available', inplace = True) + + lbl = LabelEncoder() + ny_housing['LotShape'] = lbl.fit_transform(ny_housing['LotShape']) + return ny_housing + +encoding(ny_housing) + + + + diff --git a/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..56e5423 Binary files /dev/null and b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc new file mode 100644 index 0000000..b9dc7a5 Binary files /dev/null and b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc differ