diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..bfdd6c3 Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e2c3a03 Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..ebd69e1 Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..3e98651 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,6 +1,9 @@ +# %load q01_missing_value/build.py # Default imports import pandas as pd +from sklearn.preprocessing import Imputer + # Data loading ny_housing = pd.read_csv('data/train.csv') # Selecting 4 most relevant variables along with target variable from the dataset fot the Cleaning and Preprocessing. @@ -8,3 +11,16 @@ # Write your code here: +def imputation(housing_data): + df = housing_data + imp = Imputer(missing_values=float('NaN'),strategy='mean', axis=0) + df['MasVnrArea'] = imp.fit_transform(df[['MasVnrArea']]) + df['GrLivArea'] = imp.fit_transform(df[['GrLivArea']]) + df['SalePrice'] = imp.fit_transform(df[['SalePrice']]) + df['LotShape'] = df['LotShape'].fillna(df['LotShape'].mode()[0]) + df['GarageType'] = df['GarageType'].fillna(df['GarageType'].mode()[0]) + return df[['MasVnrArea', 'GrLivArea', 'SalePrice']],df[['LotShape', 'GarageType']] +#imputation(housing_data) + + + diff --git a/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..3585794 Binary files /dev/null and b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc new file mode 100644 index 0000000..56f9f54 Binary files /dev/null and b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e599015 Binary files /dev/null and b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/build.cpython-36.pyc b/q02_outlier_removal/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..9dcf93b Binary files /dev/null and b/q02_outlier_removal/__pycache__/build.cpython-36.pyc differ diff --git a/q02_outlier_removal/build.py b/q02_outlier_removal/build.py index 74df5f2..8e2507a 100644 --- a/q02_outlier_removal/build.py +++ b/q02_outlier_removal/build.py @@ -1,3 +1,4 @@ +# %load q02_outlier_removal/build.py # Default imports import pandas as pd @@ -6,5 +7,16 @@ # Selecting 4 most relevant variables from the dataset fot the Cleaning and Preprocessing. housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] - # Write your code here: +def outlier_removal(housing_data): + df = housing_data + df = df.drop(df[(df['MasVnrArea']>df['MasVnrArea'].quantile(0.95)) | (df['GrLivArea']>df['GrLivArea'].quantile(0.95)) | (df['SalePrice']>df['SalePrice'].quantile(0.95))].index) + + #print (df) + + return df + +outlier_removal(housing_data) + + + diff --git a/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d770a32 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc new file mode 100644 index 0000000..c3a08b7 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5b0a838 Binary files /dev/null and b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/build.cpython-36.pyc b/q03_skewness_log/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..5345c7f Binary files /dev/null and b/q03_skewness_log/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_log/build.py b/q03_skewness_log/build.py index f008d0f..1d0932b 100644 --- a/q03_skewness_log/build.py +++ b/q03_skewness_log/build.py @@ -1,8 +1,26 @@ +# %load q03_skewness_log/build.py from scipy.stats import skew import pandas as pd import numpy as np data = pd.read_csv('data/train.csv') - # Write code here: + +def skewness_log(data): + target1 = np.log(data['SalePrice']) + sk_val1 = skew(target1) + + target2 = np.log(data['GrLivArea']) + sk_val2 = skew(target2) + + print (sk_val1,sk_val2) + + return sk_val2,sk_val1 + +skewness_log(data) + + + + + diff --git a/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5da56be Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc new file mode 100644 index 0000000..4429949 Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..985c96c Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..3c7a190 Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/build.py b/q03_skewness_sqrt/build.py index 4bdb0e4..edee4cf 100644 --- a/q03_skewness_sqrt/build.py +++ b/q03_skewness_sqrt/build.py @@ -1,10 +1,25 @@ +# %load q03_skewness_sqrt/build.py # Default imports from scipy.stats import skew import pandas as pd import numpy as np +from math import sqrt ny_housing = pd.read_csv('data/train.csv') - # Write your Solution Here: +def skewness_sqrt(ny_housing): + + target1 = np.sqrt(ny_housing['SalePrice']) + sk_val1 = skew(target1) + + target2 = np.sqrt(ny_housing['GrLivArea']) + sk_val2 = skew(target2) + + #print (sk_val2,sk_val1) + + return sk_val2,sk_val1 + +skewness_sqrt(ny_housing) + diff --git a/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d187664 Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc new file mode 100644 index 0000000..2e1d768 Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/__init__.cpython-36.pyc b/q04_encoding/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d950306 Binary files /dev/null and b/q04_encoding/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/build.cpython-36.pyc b/q04_encoding/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..edbd127 Binary files /dev/null and b/q04_encoding/__pycache__/build.cpython-36.pyc differ diff --git a/q04_encoding/build.py b/q04_encoding/build.py index a52c57f..16ac59e 100644 --- a/q04_encoding/build.py +++ b/q04_encoding/build.py @@ -1,3 +1,4 @@ +# %load q04_encoding/build.py # Default imports import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -5,6 +6,20 @@ ny_housing = pd.read_csv('data/train.csv') housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] - # Write your code here: +def encoding(housing_data): + lablel_encoder = LabelEncoder() + housing_data['LotShape'] = lablel_encoder.fit_transform(housing_data['LotShape']) + + housing_data = pd.get_dummies(housing_data,dummy_na=True) + + #print (housing_data) + + return housing_data + +encoding(housing_data) + + + + diff --git a/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..545974f Binary files /dev/null and b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc new file mode 100644 index 0000000..cd9abb5 Binary files /dev/null and b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc differ