diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..4ee3283 Binary files /dev/null and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..37bcdbc Binary files /dev/null and b/q01_missing_value/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/__pycache__/build.cpython-36.pyc b/q01_missing_value/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..db40bf2 Binary files /dev/null and b/q01_missing_value/__pycache__/build.cpython-36.pyc differ diff --git a/q01_missing_value/build.py b/q01_missing_value/build.py index 7dc4d18..0ed0682 100644 --- a/q01_missing_value/build.py +++ b/q01_missing_value/build.py @@ -1,6 +1,7 @@ +# %load q01_missing_value/build.py # Default imports import pandas as pd - +from sklearn.preprocessing import Imputer # Data loading ny_housing = pd.read_csv('data/train.csv') # Selecting 4 most relevant variables along with target variable from the dataset fot the Cleaning and Preprocessing. @@ -8,3 +9,15 @@ # Write your code here: +def imputation(housing_data): + df = housing_data + imp = Imputer(missing_values=float('NaN'),strategy='mean', axis=0) + df['MasVnrArea'] = imp.fit_transform(df[['MasVnrArea']]) + df['GrLivArea'] = imp.fit_transform(df[['GrLivArea']]) + df['SalePrice'] = imp.fit_transform(df[['SalePrice']]) + df['LotShape'] = df['LotShape'].fillna(df['LotShape'].mode()[0]) + df['GarageType'] = df['GarageType'].fillna(df['GarageType'].mode()[0]) + return df[['MasVnrArea', 'GrLivArea', 'SalePrice']],df[['LotShape', 'GarageType']] +#imputation(housing_data) + + diff --git a/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f4bac6d Binary files /dev/null and b/q01_missing_value/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc new file mode 100644 index 0000000..26ae45a Binary files /dev/null and b/q01_missing_value/tests/__pycache__/test_q01_imputation.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..bb5bb6a Binary files /dev/null and b/q02_outlier_removal/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/__pycache__/build.cpython-36.pyc b/q02_outlier_removal/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..c9f4067 Binary files /dev/null and b/q02_outlier_removal/__pycache__/build.cpython-36.pyc differ diff --git a/q02_outlier_removal/build.py b/q02_outlier_removal/build.py index 74df5f2..a45721d 100644 --- a/q02_outlier_removal/build.py +++ b/q02_outlier_removal/build.py @@ -1,3 +1,4 @@ +# %load q02_outlier_removal/build.py # Default imports import pandas as pd @@ -8,3 +9,13 @@ # Write your code here: +def outlier_removal(df): + + num_columns =df.select_dtypes(include=['float64','int64']) + quantile_95= num_columns.quantile(0.95) + for colname in num_columns: + quantile = quantile_95[colname] + df=df.drop(df[df[colname]>quantile].index) + return df + + diff --git a/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..680d671 Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc new file mode 100644 index 0000000..d69e1bb Binary files /dev/null and b/q02_outlier_removal/tests/__pycache__/test_q02_outlier_removal.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..805dcd1 Binary files /dev/null and b/q03_skewness_log/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/__pycache__/build.cpython-36.pyc b/q03_skewness_log/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..85f86eb Binary files /dev/null and b/q03_skewness_log/__pycache__/build.cpython-36.pyc differ diff --git a/q03_skewness_log/build.py b/q03_skewness_log/build.py index f008d0f..990469e 100644 --- a/q03_skewness_log/build.py +++ b/q03_skewness_log/build.py @@ -1,3 +1,4 @@ +# %load q03_skewness_log/build.py from scipy.stats import skew import pandas as pd import numpy as np @@ -6,3 +7,13 @@ # Write code here: +def skewness_log(data): + data['GrLivAreaLT'] = np.log(data['GrLivArea']) + skewed_grLiv = skew(data['GrLivAreaLT']) + + data['SalePriceLT'] = np.log(data['SalePrice']) + skewed_SaleP = skew(data['SalePriceLT']) + + return skewed_grLiv, skewed_SaleP + + diff --git a/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..e8f112c Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc new file mode 100644 index 0000000..6c631a0 Binary files /dev/null and b/q03_skewness_log/tests/__pycache__/test_q03_skewness_log.cpython-36.pyc differ