def plot_corr(data, size=11):
    """Draw the pairwise correlation matrix of ``data`` as a heat map.

    Args:
        data: DataFrame whose columns are correlated with ``data.corr()``.
        size: Width and height of the square figure, passed to ``figsize``.

    Returns:
        The matplotlib ``Axes`` holding the matrix image, with one tick per
        column on both axes, labelled with the column names (x labels are
        rotated 90 degrees).
    """
    # Imported locally so the function carries its own plotting dependency.
    from matplotlib.pyplot import subplots

    corr = data.corr()
    labels = list(corr.columns)
    positions = list(range(len(labels)))

    fig, ax = subplots(figsize=(size, size))

    # Colormap is set on this image only; pyplot's set_cmap() would also change
    # the default colormap for every later plot in the process.
    ax.matshow(corr, cmap='YlOrRd')

    # Configure ticks on the Axes we created rather than on pyplot's implicit
    # "current axes", so the result does not depend on global pyplot state.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    return ax
def percentile_k_features(df, k=20):
    """List the features kept by a univariate F-test, highest score first.

    The last column of ``df`` is used as the regression target and every
    other column as a candidate feature.

    Args:
        df: DataFrame with the target in its last column.
        k: Percentile handed to ``SelectPercentile`` (share of features kept).

    Returns:
        A list of the kept column names, ordered by descending
        ``f_regression`` score; equal scores are ordered by descending
        column position.
    """
    from sklearn.feature_selection import SelectPercentile, f_regression

    predictors = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    picker = SelectPercentile(f_regression, percentile=k)
    picker.fit(predictors, target)

    kept = picker.get_support(indices=True)

    # Pair each kept column position with its score and sort the pairs
    # descending, so the score is the primary key and the position the
    # tie-breaker.
    scored = list(zip(picker.scores_[kept], kept))
    scored.sort(reverse=True)
    ordered = [position for _, position in scored]

    return list(df.iloc[:, ordered].columns.values)
def rf_rfe(df, random_state=None):
    """Select features by recursive feature elimination with a random forest.

    The last column of ``df`` is used as the target and every other column as
    a candidate feature. The number of features kept is half of the total
    column count of ``df`` (target included), rounded down.

    Args:
        df: DataFrame with the target in its last column.
        random_state: Seed forwarded to ``RandomForestClassifier``. The
            default ``None`` leaves the forest unseeded, so repeated calls
            may select different features; pass an int for reproducible
            results.

    Returns:
        A list of the selected column names, in their original column order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFE

    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    model = RandomForestClassifier(random_state=random_state)

    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # make it keyword-only, and the keyword form also works on older ones.
    selector = RFE(model, n_features_to_select=df.shape[1] // 2)
    selector.fit(X, y)

    # Every selected feature has ranking_ == 1, so ordering by rank adds
    # nothing; get_support already yields positions in ascending order.
    kept = selector.get_support(indices=True)
    return list(X.columns.values[kept])
def select_from_model(df, random_state=9):
    """Select features with ``SelectFromModel`` on a fitted random forest.

    The last column of ``df`` is used as the target and every other column as
    a candidate feature.

    Args:
        df: DataFrame with the target in its last column.
        random_state: Seed forwarded to ``RandomForestClassifier``; defaults
            to 9.

    Returns:
        A list of the column names that ``SelectFromModel`` keeps, in their
        original column order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, y)

    # prefit=True makes the selector reuse the forest fitted above, so the
    # support mask can be read directly; no transform() call is needed just
    # to learn which columns are kept.
    selector = SelectFromModel(clf, prefit=True)
    kept = selector.get_support(indices=True)
    return list(X.columns.values[kept])
def foward_selected(df, model=None):
    """Order all features by greedy forward selection.

    The last column of ``df`` is used as the target and every other column as
    a candidate feature. Starting from an empty set, each round fits ``model``
    once per remaining candidate (on the already selected features plus that
    candidate), scores the fit on the same data, and keeps the candidate with
    the highest score. Rounds continue until every feature has been selected.

    The function name keeps its original spelling so existing callers still
    resolve it.

    Args:
        df: DataFrame with the target in its last column.
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``. When
            omitted, a fresh ``sklearn.linear_model.LinearRegression`` is
            created per call instead of sharing one instance across calls.

    Returns:
        A ``(selected, scores)`` tuple: ``selected`` lists the feature names in
        the order they were added, and ``scores[i]`` is the model score
        obtained with ``selected[:i + 1]``. Both lists are empty when ``df``
        has no feature columns.
    """
    if model is None:
        # Imported lazily so a caller-supplied model needs no scikit-learn.
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()

    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    remaining = list(features.columns)
    selected = []
    scores = []

    while remaining:
        best_score = None
        best_name = None
        # Scores are compared only within the current round, and every
        # not-yet-selected column (including the last one) is a candidate.
        for name in remaining:
            trial = features[selected + [name]]
            model.fit(trial, target)
            score = model.score(trial, target)
            # Strict ">" keeps the earliest column on ties.
            if best_score is None or score > best_score:
                best_score, best_name = score, name

        selected.append(best_name)
        remaining.remove(best_name)
        scores.append(best_score)

    return selected, scores