def plot_corr(data, size=11):
    """Draw the correlation matrix of *data* as a heat map.

    Args:
        data: DataFrame whose pairwise column correlations
            (``data.corr()``) are plotted.
        size: Width and height of the square figure, in inches.

    Returns:
        The matplotlib ``Axes`` holding the heat map, so callers and tests
        can inspect or further customise the plot.
    """
    corr = data.corr()
    labels = corr.columns
    positions = range(len(labels))

    fig, ax = subplots(figsize=(size, size))
    # Pass the colormap explicitly instead of mutating pyplot's global
    # default through set_cmap().
    ax.matshow(corr, cmap='YlOrRd')

    # Configure ticks on the axes object itself rather than relying on
    # pyplot's notion of the "current" axes.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

    # Returning the axes restores the pre-patch contract (`return ax`);
    # a bare `return` handed callers None.
    return ax
def percentile_k_features(data, k=20):
    """Return the top *k* percent of features ranked by univariate F-score.

    The last column of *data* is used as the regression target and every
    other column as a candidate feature, matching the convention of the
    other selection helpers in this project.
    NOTE(review): the previous version read the target by the literal
    name 'SalePrice'; this assumes that column is the last one - confirm.

    Args:
        data: DataFrame of numeric features with the target in the last
            column.
        k: Percentile of features to keep (0-100).

    Returns:
        List of selected column names, ordered from highest to lowest
        F-score.
    """
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Let scikit-learn decide how many features fall inside the requested
    # percentile instead of the hand-rolled `int(k/100*n) + 1`, which kept
    # one feature too many whenever k/100*n was a whole number.
    selector = SelectPercentile(score_func=f_regression, percentile=k)
    selector.fit(X, y)
    mask = selector.get_support()

    # Sort (score, name) pairs descending so the strongest feature is first;
    # ties fall back to the column name, as before.
    ranked = sorted(zip(selector.scores_[mask], X.columns[mask]), reverse=True)
    return [name for _, name in ranked]
def rf_rfe(data):
    """Select features by recursive feature elimination with a random forest.

    The last column of *data* is the target; all other columns are
    candidate features. RFE keeps ``data.shape[1] // 2`` features.

    Args:
        data: DataFrame with the target in the last column.

    Returns:
        List of the selected column names, in their original column order.
    """
    X, y = data.iloc[:, :-1], data.iloc[:, -1]

    # NOTE(review): no random_state is set, so the selection can vary
    # between runs - confirm whether that is intended.
    model = RandomForestClassifier()

    # n_features_to_select must be passed by keyword: newer scikit-learn
    # releases reject it as a positional argument.
    selector = RFE(model, n_features_to_select=data.shape[1] // 2)
    selector.fit(X, y)

    # Every selected feature has ranking_ == 1, so the old sort by ranking
    # only reproduced column order; index the columns with the mask directly.
    return list(X.columns[selector.get_support()])
def select_from_model(data):
    """Select features whose random-forest importance passes the default
    ``SelectFromModel`` threshold.

    The last column of *data* is the target; all other columns are
    candidate features.

    Args:
        data: DataFrame with the target in the last column.

    Returns:
        List of the selected column names, in their original column order.
    """
    X, y = data.iloc[:, :-1], data.iloc[:, -1]

    # Fixed seed keeps the fitted importances, and so the selection,
    # reproducible.
    model = RandomForestClassifier(random_state=9)
    model.fit(X, y)

    # prefit=True: reuse the forest fitted above instead of refitting.
    selector = SelectFromModel(model, prefit=True)

    # The support mask is all that is needed; the previous version also
    # called transform(X) and threw the result away.
    return list(X.columns[selector.get_support()])
def forward_selected(data, model):
    """Rank features by greedy forward selection.

    Starting from an empty set, each round fits *model* on the already
    selected features plus one candidate, for every remaining candidate,
    and keeps the candidate with the highest ``model.score``. Rounds
    continue until every feature has been selected.

    The last column of *data* is the target; all other columns are
    candidate features.

    Args:
        data: DataFrame with the target in the last column.
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
            It is refitted repeatedly and left fitted on the last
            candidate subset that was evaluated.

    Returns:
        Tuple ``(selected, scores)``: ``selected`` lists the feature names
        in the order they were chosen, and ``scores[i]`` is the score of
        the model fitted on ``selected[:i + 1]``. Both are empty when
        *data* has no feature columns.
    """
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    remaining = list(X.columns)
    selected = []
    scores = []

    while remaining:
        best_name, best_score = None, None
        # Evaluate every remaining feature. The previous implementation
        # looped over range(len(columns) - 1), so the last column was never
        # a candidate, and it tested membership against a stale list, so a
        # just-selected feature could be added twice.
        for name in remaining:
            subset = X[selected + [name]]
            model.fit(subset, y)
            score = model.score(subset, y)
            # Strict '>' keeps the earliest column on ties.
            if best_score is None or score > best_score:
                best_name, best_score = name, score

        selected.append(best_name)
        remaining.remove(best_name)
        scores.append(best_score)

    return selected, scores