diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index abc397a..2503bdb 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc index 460f88a..69c3d6c 100644 Binary files a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/__pycache__/build.cpython-36.pyc b/q01_plot_corr/__pycache__/build.cpython-36.pyc index f4059a3..6021904 100644 Binary files a/q01_plot_corr/__pycache__/build.cpython-36.pyc and b/q01_plot_corr/__pycache__/build.cpython-36.pyc differ diff --git a/q01_plot_corr/build.py b/q01_plot_corr/build.py index edc724a..544dfad 100644 --- a/q01_plot_corr/build.py +++ b/q01_plot_corr/build.py @@ -1,7 +1,8 @@ +# %load q01_plot_corr/build.py # Default imports import pandas as pd -from matplotlib.pyplot import yticks, xticks, subplots, set_cmap -plt.switch_backend('agg') +from matplotlib.pyplot import yticks, xticks, subplots, set_cmap,show +# plt.switch_backend('agg') data = pd.read_csv('data/house_prices_multivariate.csv') @@ -9,8 +10,13 @@ def plot_corr(data, size=11): corr = data.corr() fig, ax = subplots(figsize=(size, size)) - set_cmap("YlOrRd") + set_cmap('YlOrRd') ax.matshow(corr) xticks(range(len(corr.columns)), corr.columns, rotation=90) yticks(range(len(corr.columns)), corr.columns) - return ax + ax + show() + +plot_corr(data, size=11) + + diff --git a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc index c4bc30d..be2d453 100644 Binary files a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc index 40d2b70..27c7338 100644 Binary files a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc and b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc index 43047f0..fd006c2 100644 Binary files a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/__pycache__/build.cpython-36.pyc b/q02_best_k_features/__pycache__/build.cpython-36.pyc index 8372777..1a3bff0 100644 Binary files a/q02_best_k_features/__pycache__/build.cpython-36.pyc and b/q02_best_k_features/__pycache__/build.cpython-36.pyc differ diff --git a/q02_best_k_features/build.py b/q02_best_k_features/build.py index 9b1046a..cb1db0a 100644 --- a/q02_best_k_features/build.py +++ b/q02_best_k_features/build.py @@ -1,3 +1,4 @@ +# %load q02_best_k_features/build.py # Default imports import pandas as pd @@ -9,4 +10,33 @@ # Write your solution here: +def percentile_k_features(df,k = 20): + + X = df.iloc[:,:-1] + y = df.iloc[:,-1] + + selector = SelectPercentile(f_regression, k).fit(X, y) + #X_new = selector.transform(X) # Not needed here as we need the features and their scores. + + #We could have directly provided the list of columns selected by using the line below but the test case fails. + #Column list must be in descending order of scores. + ColumnsSelected = X.columns[selector.get_support()] + + + #Hence, the below trouble. + feature_score = selector.scores_ # This gives scores of each feature + feature_selected = X.columns.values # This gives the name of all original features/columns + bool_index = selector.get_support() # Boolean values viz. True/False against each feature of column. True indicating the feature being selected. + + #Zipping all the 3 values into a list + zipped_list = list(zip(feature_score,feature_selected,bool_index)) + + #Putting the list in dataframe after sorting it in descending order based on scores. + Final_df = pd.DataFrame(zipped_list).sort_values(by=0,ascending=False) + + return list(Final_df.loc[Final_df.iloc[:,2]][1]) + + +percentile_k_features(df = data,k = 20) + diff --git a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc index 86a25cf..247fa93 100644 Binary files a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc and b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc new file mode 100644 index 0000000..d78c737 Binary files /dev/null and b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..b771800 Binary files /dev/null and b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/__pycache__/build.cpython-36.pyc b/q03_rf_rfe/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..cf9e520 Binary files /dev/null and b/q03_rf_rfe/__pycache__/build.cpython-36.pyc differ diff --git a/q03_rf_rfe/build.py b/q03_rf_rfe/build.py index e8a8d20..aa047a1 100644 --- a/q03_rf_rfe/build.py +++ b/q03_rf_rfe/build.py @@ -1,6 +1,7 @@ +# %load q03_rf_rfe/build.py # Default imports import pandas as pd - +import numpy as np data = pd.read_csv('data/house_prices_multivariate.csv') from sklearn.feature_selection import RFE @@ -8,4 +9,19 @@ # Your solution code here +def rf_rfe(df): + + X = data[[col for col in data.columns if col != 'SalePrice']] + y = data['SalePrice'] + + RFC_model = RandomForestClassifier(random_state=9) + + RFE_model = RFE(estimator=RFC_model,n_features_to_select=None) + RFE_model.fit(X,y) + +# print(X.columns) +# print(np.array(RFE_model.support_)) +# print(np.array(RFE_model.ranking_)) + return list(X.columns[RFE_model.support_]) +rf_rfe(data) diff --git a/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..184579d Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc new file mode 100644 index 0000000..cf13a74 Binary files /dev/null and b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..91fdd2e Binary files /dev/null and b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/__pycache__/build.cpython-36.pyc b/q04_select_from_model/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..006f455 Binary files /dev/null and b/q04_select_from_model/__pycache__/build.cpython-36.pyc differ diff --git a/q04_select_from_model/build.py b/q04_select_from_model/build.py index 12dd1df..e0308ff 100644 --- a/q04_select_from_model/build.py +++ b/q04_select_from_model/build.py @@ -1,3 +1,4 @@ +# %load q04_select_from_model/build.py # Default imports from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier @@ -6,5 +7,18 @@ data = pd.read_csv('data/house_prices_multivariate.csv') - # Your solution code here +def select_from_model(df): + X = data[[col for col in data.columns if col != 'SalePrice']] + y = data['SalePrice'] + + RFC_model = RandomForestClassifier(random_state=9) + RFC_model.fit(X,y) + + SFM_model = SelectFromModel(RFC_model,prefit=True) + #X_new = SFM_model.transform(X) #Not needed here. + + return list(X.columns[SFM_model.get_support()]) + +select_from_model(data) + diff --git a/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..80d1f6e Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc new file mode 100644 index 0000000..909f97b Binary files /dev/null and b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc differ