diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..47bb1a9 Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d3ac6a5 Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..1e4368f Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..cb4dba5 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,3 +1,4 @@ +# %load q01_missing_value/build.py # Default imports import pandas as pd @@ -5,6 +6,19 @@ ny_housing = pd.read_csv('data/train.csv') # Selecting 4 most relevant variables along with target variable from the dataset fot the Cleaning and Preprocessing. housing_data = ny_housing[['MasVnrArea', 'GrLivArea', 'LotShape', 'GarageType', 'SalePrice']] +df = housing_data.copy() +df1 =df.select_dtypes(include=['int','float']) +# Write your code here: + +def imputation(df): + df1=df.select_dtypes(include=['int','float']) + df2=df.select_dtypes(include=['object']) + df2['LotShape']=df2['LotShape'].fillna(df2['LotShape'].mode().index[0]) + df2['GarageType']=df2['GarageType'].fillna(df2['GarageType'].value_counts().index[0]) + for col in df1.columns: + df1[col] = df1[col].fillna(df1[col].mean()) + + return df1,df2 + -# Write your code here: diff --git a/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..ab0d41c Binary files /dev/null and b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc new file mode 100644 index 0000000..9ac5220 Binary files /dev/null and b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..5dcfc6f Binary files /dev/null and b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/build.cpython-36.pyc b/q02_outlier_removal/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..83e911e Binary files /dev/null and b/q02_outlier_removal/__pycache__/build.cpython-36.pyc differ diff --git a/q02_outlier_removal/build.py b/q02_outlier_removal/build.py index 74df5f2..dfd8780 100644 --- a/q02_outlier_removal/build.py +++ b/q02_outlier_removal/build.py @@ -1,3 +1,4 @@ +# %load q02_outlier_removal/build.py # Default imports import pandas as pd @@ -8,3 +9,18 @@ # Write your code here: +def outlier_removal (dataset): + df= dataset.copy() + num_columns =df.select_dtypes(include=['float64','int64']) + + quantile_95= num_columns.quantile(0.95) + + for colname in num_columns: + quantile = quantile_95[ colname ] + df=df.drop(df[df[colname]>quantile].index) + + + return df +outlier_removal(housing_data) + + diff --git a/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..cc8037a Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc new file mode 100644 index 0000000..47f95b9 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..da6c5d0 Binary files /dev/null and b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/build.cpython-36.pyc b/q03_skewness_log/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..6b6a468 Binary files /dev/null and b/q03_skewness_log/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_log/build.py b/q03_skewness_log/build.py index f008d0f..42b6ab2 100644 --- a/q03_skewness_log/build.py +++ b/q03_skewness_log/build.py @@ -1,8 +1,15 @@ +# %load q03_skewness_log/build.py from scipy.stats import skew import pandas as pd import numpy as np data = pd.read_csv('data/train.csv') +def skewness_log(data): + + data['GrLivArea'] = np.log(data['GrLivArea']) + data['SalePrice'] = np.log(data['SalePrice']) + + return skew(data['GrLivArea']),skew(data['SalePrice']) +skewness_log(data) -# Write code here: diff --git a/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..8cc6160 Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc new file mode 100644 index 0000000..af1f7cf Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..91ab89f Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..e6dbc5d Binary files /dev/null and b/q03_skewness_sqrt/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/build.py b/q03_skewness_sqrt/build.py index 4bdb0e4..3a5da89 100644 --- a/q03_skewness_sqrt/build.py +++ b/q03_skewness_sqrt/build.py @@ -1,3 +1,4 @@ +# %load q03_skewness_sqrt/build.py # Default imports from scipy.stats import skew import pandas as pd @@ -7,4 +8,11 @@ # Write your Solution Here: +def skewness_sqrt(ny_housing): + + ny_housing['SalePrice'] = np.sqrt(ny_housing['SalePrice']) + ny_housing['GrLivArea'] = np.sqrt(ny_housing['GrLivArea']) + + return skew(ny_housing['GrLivArea']),skew(ny_housing['SalePrice']) + diff --git a/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..9b61d9d Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc new file mode 100644 index 0000000..de3b58e Binary files /dev/null and b/q03_skewness_sqrt/tests/__pycache__/test-q03_skewness_sqrt.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/__init__.cpython-36.pyc b/q04_encoding/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..67bdb54 Binary files /dev/null and b/q04_encoding/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/__pycache__/build.cpython-36.pyc b/q04_encoding/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..1628658 Binary files /dev/null and b/q04_encoding/__pycache__/build.cpython-36.pyc differ diff --git a/q04_encoding/build.py b/q04_encoding/build.py index a52c57f..f055131 100644 --- a/q04_encoding/build.py +++ b/q04_encoding/build.py @@ -1,3 +1,4 @@ +# %load q04_encoding/build.py # Default imports import pandas as pd from sklearn.preprocessing import LabelEncoder @@ -7,4 +8,11 @@ # Write your code here: +def encoding(housing_data): + + label_encoder = LabelEncoder() + housing_data['LotShape_label'] = label_encoder.fit_transform(housing_data['LotShape']) + housing_data = pd.get_dummies(housing_data, columns=['GarageType']) + return housing_data + diff --git a/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f9857cd Binary files /dev/null and b/q04_encoding/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc new file mode 100644 index 0000000..7abd84d Binary files /dev/null and b/q04_encoding/tests/__pycache__/test_q04_encoding.cpython-36.pyc differ