def imputation(housing_data):
    """Impute missing values: mean for numeric columns, mode for the rest.

    The previous implementation relied on ``sklearn.preprocessing.Imputer``
    (removed from scikit-learn in 0.22) and on chained in-place ``fillna``
    calls against ``housing_data.loc[:, col]``.  This version uses pandas
    only and never writes to the caller's frame.

    Parameters
    ----------
    housing_data : pandas.DataFrame
        Frame containing numeric and/or non-numeric columns.  It is left
        unmodified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_num, df_cat)``.  ``df_num`` holds the numeric columns as floats
        with missing entries replaced by the column mean.  ``df_cat`` holds
        the remaining columns (in the sorted order produced by
        ``Index.difference``) with missing entries replaced by the first
        mode of the column.  Both frames keep the index of ``housing_data``.
        A column with no observed values at all is returned unchanged.
    """
    # Separate numeric and categorical columns.
    num_col = housing_data._get_numeric_data().columns
    cat_col = housing_data.columns.difference(num_col)

    # Numeric data: cast to float (the dtype the old Imputer produced), then
    # fill each column with its own mean.  ``mean`` skips NaN by default.
    numeric = housing_data.loc[:, num_col].astype(float)
    df_num = numeric.fillna(numeric.mean())

    # Categorical data: work on a copy so the input frame is not mutated.
    df_cat = housing_data.loc[:, cat_col].copy()
    for col in cat_col:
        modes = df_cat[col].mode()
        # ``mode`` is empty when every value is missing; nothing to fill with.
        if not modes.empty:
            df_cat[col] = df_cat[col].fillna(modes.iloc[0])

    return df_num, df_cat
def outlier_removal(housing_data, quantile=0.95):
    """Drop rows whose numeric values exceed a per-column upper quantile.

    Numeric columns are processed one after another; the cutoff for each
    column is computed on the rows that survived the previous columns, so
    the result depends on column order.

    Rows are filtered with a boolean mask rather than ``drop(<labels>)``.
    Dropping by label removes every row that shares an index label with an
    outlier, which discards non-outlier rows when the index is not unique.

    Parameters
    ----------
    housing_data : pandas.DataFrame
        Input frame.  It is not modified.
    quantile : float, optional
        Upper quantile used as the cutoff for each numeric column.
        Defaults to 0.95.

    Returns
    -------
    pandas.DataFrame
        A new frame without the outlier rows.  The caller must use the
        return value; the argument itself is left untouched.
    """
    num_col = housing_data._get_numeric_data().columns
    for col in num_col:
        cutoff = housing_data[col].quantile(quantile)
        is_outlier = housing_data[col] > cutoff
        # NaN compares False against the cutoff, so rows with missing
        # values in this column are kept.
        housing_data = housing_data.loc[~is_outlier]
    return housing_data
def skewness_log(data):
    """Return the skewness of log-transformed living area and sale price.

    Parameters
    ----------
    data : mapping of column name to array-like (e.g. pandas.DataFrame)
        Must provide the ``'GrLivArea'`` and ``'SalePrice'`` columns.

    Returns
    -------
    tuple
        ``(skew(log(GrLivArea)), skew(log(SalePrice)))`` using the natural
        logarithm.
    """
    columns = ('GrLivArea', 'SalePrice')
    return tuple(skew(np.log(data[name])) for name in columns)
def skewness_sqrt(ny_housing):
    """Return the skewness of square-root-transformed area and sale price.

    Parameters
    ----------
    ny_housing : mapping of column name to array-like (e.g. pandas.DataFrame)
        Must provide the ``'SalePrice'`` and ``'GrLivArea'`` columns.

    Returns
    -------
    tuple
        ``(skew(sqrt(GrLivArea)), skew(sqrt(SalePrice)))`` -- living area
        first, sale price second.
    """
    # Sale price is evaluated first, but living area leads the returned pair.
    price_root = np.sqrt(ny_housing['SalePrice'])
    price_skew = skew(price_root)

    area_root = np.sqrt(ny_housing['GrLivArea'])
    area_skew = skew(area_root)

    return area_skew, price_skew