diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..2dc49e0 Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..6649216 Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..4e64d73 Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..6100490 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,10 +1,25 @@ +# %load q01_missing_value/build.py # Default imports import pandas as pd - +from sklearn.preprocessing import Imputer # Data loading ny_housing = pd.read_csv('data/train.csv') # Selecting 4 most relevant variables along with target variable from the dataset fot the Cleaning and Preprocessing. 
def imputation(df):
    """Fill missing values and return numeric and categorical parts.

    Columns whose dtype is ``int64`` or ``float64`` are treated as numeric
    and get their missing entries replaced by the column mean. Every other
    column is treated as categorical and gets its missing entries replaced
    by the column's most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is only read, never modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(numeric_imputed_df, cat_df)``. The numeric frame keeps the
        column order of ``df`` and is cast to ``float64``. The categorical
        frame has its columns in the sorted order produced by
        ``Index.difference``. Both keep the index of ``df``.
    """
    # Build a boolean column mask rather than looking dtypes up by integer
    # position: ``df.dtypes`` is indexed by column label, and positional
    # ``[]`` access on such a Series is deprecated in pandas.
    is_numeric = [dtype in ('int64', 'float64') for dtype in df.dtypes]
    numeric_df = df.loc[:, is_numeric]
    cat_features = df.columns.difference(numeric_df.columns)
    # Copy so the fills below never write into a slice of the caller's frame.
    cat_df = df.loc[:, cat_features].copy()

    # Mean imputation with plain pandas, so the function does not rely on
    # sklearn's ``Imputer`` (no longer shipped by newer scikit-learn). The
    # float64 cast keeps the all-float output the imputer used to produce.
    numeric_imputed_df = numeric_df.astype('float64')
    numeric_imputed_df = numeric_imputed_df.fillna(numeric_imputed_df.mean())

    for feature in cat_features:
        modes = cat_df[feature].mode()
        # An all-missing column has no mode; leave it as is instead of
        # raising on ``modes[0]``.
        if not modes.empty:
            cat_df[feature] = cat_df[feature].fillna(modes.iloc[0])
    return numeric_imputed_df, cat_df
def outlier_removal(df, quantile=0.95):
    """Drop rows that exceed a quantile threshold in any numeric column.

    For every column whose dtype is ``int64`` or ``float64`` the
    ``quantile``-th quantile is computed over the full input (linear
    interpolation). A row is removed when its value in at least one of
    those columns is strictly greater than that column's threshold.
    Missing values never count as exceeding a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is only read, never modified.
    quantile : float, optional
        Quantile in ``[0, 1]`` used as the upper cut-off. Defaults to
        ``0.95``, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are within the thresholds, with all
        original columns and their original index labels.
    """
    # Build a boolean column mask rather than looking dtypes up by integer
    # position: ``df.dtypes`` is indexed by column label, and positional
    # ``[]`` access on such a Series is deprecated in pandas.
    is_numeric = [dtype in ('int64', 'float64') for dtype in df.dtypes]
    numeric_df = df.loc[:, is_numeric]
    if numeric_df.shape[1] == 0:
        # Nothing to compare against, so every row is kept.
        return df.copy()

    thresholds = numeric_df.quantile(q=quantile, interpolation='linear')
    # One row mask for all columns. Filtering by mask (not by dropping index
    # labels) keeps rows that merely share a label with an outlier row.
    exceeds = numeric_df.gt(thresholds, axis='columns').any(axis='columns')
    return df.loc[~exceeds]
def skewness_log(df):
    """Return the skewness of log-transformed ``GrLivArea`` and ``SalePrice``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``GrLivArea`` and ``SalePrice`` columns. It is only
        read, never modified.

    Returns
    -------
    tuple
        ``(skew of log(GrLivArea), skew of log(SalePrice))``, in that order.
    """
    # Transform only the two columns of interest; copying the whole frame
    # first is unnecessary because the input is never written to.
    log_area = np.log(df['GrLivArea'])
    log_price = np.log(df['SalePrice'])
    return skew(log_area), skew(log_price)


def skewness_sqrt(df):
    """Return the skewness of sqrt-transformed ``GrLivArea`` and ``SalePrice``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``GrLivArea`` and ``SalePrice`` columns. It is only
        read, never modified.

    Returns
    -------
    tuple
        ``(skew of sqrt(GrLivArea), skew of sqrt(SalePrice))``, in that
        order.
    """
    # Transform only the two columns of interest; copying the whole frame
    # first is unnecessary because the input is never written to.
    sqrt_area = np.sqrt(df['GrLivArea'])
    sqrt_price = np.sqrt(df['SalePrice'])
    return skew(sqrt_area), skew(sqrt_price)


def encoding(df):
    """Label-encode ``LotShape`` and one-hot encode ``GarageType``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``LotShape`` and ``GarageType`` columns. It is only
        read, never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df``, an added
        ``LotShape_Label`` column with the ``LabelEncoder`` codes of
        ``LotShape``, and the dummy columns of ``GarageType`` (first
        category dropped), named after the category values.
    """
    # Work on a copy: assigning the label column on ``df`` itself would add
    # it to the caller's frame as a side effect.
    encoded = df.copy()
    le = LabelEncoder()
    encoded['LotShape_Label'] = le.fit_transform(encoded['LotShape'])
    garage_dummies = pd.get_dummies(encoded['GarageType'], drop_first=True)
    return encoded.join(garage_dummies)
Binary files /dev/null and b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc new file mode 100644 index 0000000..c414537 Binary files /dev/null and b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc differ