diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..af63a6f Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..354530f Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..eff5072 Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..1edbf02 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,6 +1,8 @@ +# %load q01_missing_value/build.py # Default imports +import numpy as np import pandas as pd - +from sklearn.preprocessing import Imputer # Data loading ny_housing = pd.read_csv('data/train.csv') # Selecting 4 most relevant variables along with target variable from the dataset fot the Cleaning and Preprocessing. 
# Write your code here:
def imputation(ny_housing):
    """Fill missing values in the ``MasVnrArea`` and ``GarageType`` columns.

    ``MasVnrArea`` gaps are replaced by the mean of the observed values and
    ``GarageType`` gaps by the most frequent observed value.  The frame that
    is passed in is not modified.

    Parameters
    ----------
    ny_housing : pandas.DataFrame
        Frame containing at least the ``MasVnrArea`` and ``GarageType``
        columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-column frames, in this order: the imputed ``MasVnrArea``
        values (as floats) and the imputed ``GarageType`` values.

    Raises
    ------
    KeyError
        If either column is missing from ``ny_housing``.
    """
    # Mean imputation is done with plain pandas: ``Series.mean`` already
    # skips NaN, so no scikit-learn imputer class is needed here.
    area = ny_housing['MasVnrArea'].astype(float)
    area = area.fillna(area.mean())

    garage = ny_housing['GarageType']
    modes = garage.mode()  # NaN is ignored; empty when every value is missing
    if not modes.empty:
        garage = garage.fillna(modes.iloc[0])

    return pd.DataFrame(area), pd.DataFrame(garage)


if __name__ == "__main__":
    # Notebook-style smoke run, kept out of the import path.
    c = imputation(ny_housing)
    print(c)
# %load q02_outlier_removal/build.py
# Default imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data
df = pd.read_csv('data/train.csv')

# Selecting 4 most relevant variables from the dataset for the Cleaning and Preprocessing.
housing_data = df[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']]


# Write your code here:
def outlier_removal(housing_data):
    """Impute missing values, then drop rows above the 95th percentile.

    ``MasVnrArea`` gaps are filled with the column mean and ``GarageType``
    gaps with the most frequent value.  A row is kept only when its
    ``MasVnrArea``, ``GrLivArea`` and ``SalePrice`` are each less than or
    equal to the 0.95 quantile of the respective (imputed) column.

    Parameters
    ----------
    housing_data : pandas.DataFrame
        Frame containing the ``MasVnrArea``, ``GrLivArea``, ``GarageType``
        and ``SalePrice`` columns.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed frame restricted to the rows that pass all three
        quantile checks; the original index labels are preserved.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy: the caller's frame is typically a column slice of a
    # larger frame, and writing into it mutates shared state and triggers
    # pandas' SettingWithCopyWarning.
    cleaned = housing_data.copy()

    cleaned['MasVnrArea'] = cleaned['MasVnrArea'].fillna(cleaned['MasVnrArea'].mean())

    # value_counts() ignores NaN and sorts by frequency, so the first index
    # entry is the most common category.  It is empty when every value is
    # missing, in which case there is nothing to fill with.
    garage_counts = cleaned['GarageType'].value_counts()
    if not garage_counts.empty:
        cleaned['GarageType'] = cleaned['GarageType'].fillna(garage_counts.index[0])

    keep = pd.Series(True, index=cleaned.index)
    for column in ('MasVnrArea', 'GrLivArea', 'SalePrice'):
        keep &= cleaned[column] <= cleaned[column].quantile(0.95)
    return cleaned[keep]


if __name__ == "__main__":
    # Notebook-style smoke run, kept out of the import path.
    c = outlier_removal(housing_data)
    print(c)
# %load q03_skewness_log/build.py
from scipy.stats import skew
import pandas as pd
import numpy as np

data = pd.read_csv('data/train.csv')
housing_data = data[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']]


def skewness_log(data):
    """Return the skewness of log-transformed ``GrLivArea`` and ``SalePrice``.

    The natural logarithm of each column is taken with ``numpy.log`` and the
    skewness of the result is computed with ``Series.skew``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``GrLivArea`` and ``SalePrice`` columns.  It is
        only read, never modified.

    Returns
    -------
    tuple
        ``(skew of log(GrLivArea), skew of log(SalePrice))`` -- note that the
        living-area value comes first.

    Raises
    ------
    KeyError
        If either column is missing from ``data``.
    """
    # Only the two transformed columns influence the result.  Imputing
    # MasVnrArea/GarageType or building an outlier mask whose result is
    # thrown away would be dead work (and would write into a slice of the
    # caller's frame), so neither is done here.
    log_area = np.log(data['GrLivArea'])
    log_price = np.log(data['SalePrice'])
    return log_area.skew(), log_price.skew()


if __name__ == "__main__":
    # Notebook-style smoke run, kept out of the import path.
    c = skewness_log(data)
    print(c)
# %load q03_skewness_sqrt/build.py
# Default imports
from scipy.stats import skew
import pandas as pd
import numpy as np

ny_housing = pd.read_csv('data/train.csv')
housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']]


def skewness_sqrt(ny_housing):
    """Return the skewness of square-root-transformed ``GrLivArea`` and ``SalePrice``.

    The square root of each column is taken with ``numpy.sqrt`` and the
    skewness of the result is computed with ``Series.skew``.

    Parameters
    ----------
    ny_housing : pandas.DataFrame
        Frame containing the ``GrLivArea`` and ``SalePrice`` columns.  It is
        only read, never modified.

    Returns
    -------
    tuple
        ``(skew of sqrt(GrLivArea), skew of sqrt(SalePrice))`` -- note that
        the living-area value comes first.

    Raises
    ------
    KeyError
        If either column is missing from ``ny_housing``.
    """
    # Only the two transformed columns influence the result, so no
    # imputation or (discarded) outlier mask is computed here.
    sqrt_area = np.sqrt(ny_housing['GrLivArea'])
    sqrt_price = np.sqrt(ny_housing['SalePrice'])
    return sqrt_area.skew(), sqrt_price.skew()


if __name__ == "__main__":
    # Notebook-style smoke run, kept out of the import path.
    c = skewness_sqrt(ny_housing)
    print(c)


# %load q04_encoding/build.py
# Default imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder

ny_housing = pd.read_csv('data/train.csv')
housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']]


def encoding(housing_data):
    """Impute missing values and encode the categorical columns.

    Steps, applied to a copy of the five working columns of ``housing_data``:

    1. ``MasVnrArea`` gaps are filled with the column mean and ``GarageType``
       gaps with the most frequent value.
    2. ``LotShape`` is replaced by the integer labels produced by
       ``LabelEncoder``.
    3. ``GarageType`` is expanded with ``pandas.get_dummies`` and the dummy
       columns are appended to the right of the frame (the ``GarageType``
       column itself is kept).

    Parameters
    ----------
    housing_data : pandas.DataFrame
        Frame containing the ``MasVnrArea``, ``GrLivArea``, ``LotShape``,
        ``GarageType`` and ``SalePrice`` columns.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        The five working columns followed by one dummy column per
        ``GarageType`` category.

    Raises
    ------
    KeyError
        If one of the five required columns is missing.
    """
    # Use the frame that was passed in (not a module-level global) and copy
    # it so that the caller's data is never written to.
    columns = ['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']
    encoded = housing_data[columns].copy()

    encoded['MasVnrArea'] = encoded['MasVnrArea'].fillna(encoded['MasVnrArea'].mean())

    # value_counts() ignores NaN and sorts by frequency; it is empty when
    # every value is missing, in which case there is nothing to fill with.
    garage_counts = encoded['GarageType'].value_counts()
    if not garage_counts.empty:
        encoded['GarageType'] = encoded['GarageType'].fillna(garage_counts.index[0])

    # fit_transform accepts the 1-D column directly; Series.reshape no longer
    # exists in current pandas, so no reshape/apply round trip is used.
    encoded['LotShape'] = LabelEncoder().fit_transform(encoded['LotShape'])

    garage_dummies = pd.get_dummies(encoded['GarageType'])
    return pd.concat([encoded, garage_dummies], axis=1)


if __name__ == "__main__":
    # Notebook-style smoke run, kept out of the import path.
    c = encoding(housing_data)
    print(c)