def plot_corr(data, size=11):
    """Draw the pairwise correlation matrix of ``data`` as a heat map.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column correlations (``data.corr()``) are plotted.
    size : int or float, default 11
        Width and height of the square figure, forwarded as ``figsize``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the rendered matrix, with one labelled tick per
        correlated column on both axes (x labels rotated 90 degrees).
    """
    corr = data.corr()
    labels = corr.columns
    positions = list(range(len(labels)))

    _, ax = subplots(figsize=(size, size))

    # Give the colormap to this image only; pyplot's set_cmap() would also
    # change the default colormap for every later plot in the process.
    ax.matshow(corr, cmap='YlOrRd')

    # Configure ticks on the axes we return instead of relying on pyplot's
    # implicit "current axes" state.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    return ax
def percentile_k_features(df, k=20):
    """Return the top ``k`` percent of features, best F-score first.

    The last column of ``df`` is used as the target and every other column
    as a candidate feature. Features are scored with ``f_regression`` and
    filtered with ``SelectPercentile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; features in all columns but the last, target in the last.
    k : int, default 20
        Percentile of features to keep, passed to ``SelectPercentile``.

    Returns
    -------
    list
        Names of the retained columns, ordered by descending score.
    """
    # Read from the argument, not from the module-level ``data`` frame.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    selector = SelectPercentile(f_regression, percentile=k)
    # Only the fitted scores and support mask are needed, so fit() suffices.
    selector.fit(X, y)

    kept = selector.get_support()
    ranked = pd.Series(selector.scores_[kept], index=X.columns[kept])
    return ranked.sort_values(ascending=False).index.tolist()
def rf_rfe(df, n_features=17):
    """Select features by recursive feature elimination with a random forest.

    The last column of ``df`` is used as the target and every other column
    as a candidate feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; features in all columns but the last, target in the last.
    n_features : int, default 17
        Number of features RFE should retain.

    Returns
    -------
    list
        Names of the retained columns, in their original column order.
    """
    # Read from the argument, not from the module-level ``data`` frame.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # NOTE(review): a classifier is fit on the last column and no
    # random_state is set, so the selection may differ between runs --
    # confirm this is intended.
    # n_features_to_select is passed by keyword: newer scikit-learn releases
    # no longer accept it positionally.
    selector = RFE(RandomForestClassifier(), n_features_to_select=n_features)
    selector.fit(X, y)

    # support_ marks exactly the features RFE ranked 1 (i.e. kept).
    return X.columns[selector.support_].tolist()
def select_from_model(df):
    """Select features whose random-forest importance passes the threshold.

    The last column of ``df`` is used as the target and every other column
    as a candidate feature. A ``RandomForestClassifier`` is fit and handed,
    already fitted, to ``SelectFromModel`` (default threshold).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; features in all columns but the last, target in the last.

    Returns
    -------
    list
        Names of the selected columns, in their original column order.
    """
    # Read from the argument, not from the module-level ``data`` frame.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Seed the estimator itself rather than calling np.random.seed(9): the
    # forest draws from the same seeded stream, but the process-wide NumPy
    # RNG is left untouched for other code.
    forest = RandomForestClassifier(random_state=9)
    forest.fit(X, y)

    mask = SelectFromModel(forest, prefit=True).get_support()
    return X.columns[mask].tolist()
def forward_selected(data, model, n_features=10):
    """Greedy forward feature selection scored on the training data.

    The last column of ``data`` is used as the target and every other column
    as a candidate feature. Starting from an empty set, each round fits
    ``model`` once per remaining candidate (current selection plus that
    candidate) and permanently adds the candidate with the highest
    ``model.score``. For scikit-learn regressors that score is R^2.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; features in all columns but the last, target in the last.
        Column names are assumed to be unique.
    model : estimator or None
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is refit in
        place many times. ``None`` falls back to ``LinearRegression()``.
    n_features : int, default 10
        Maximum number of selection rounds; capped at the number of
        candidate columns.

    Returns
    -------
    tuple of (list, list)
        The selected column names in the order they were added, and the
        best score reached after each round.
    """
    if model is None:
        model = LinearRegression()

    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    remaining = X.columns.tolist()
    selected = []
    scores = []

    # Every round derives its starting set from the previous round's winner,
    # so no column names are hard-coded and any dataset can be used.
    for _ in range(min(n_features, len(remaining))):
        best_name = None
        best_score = None
        for name in remaining:
            trial = selected + [name]
            model.fit(X[trial], y)
            score = model.score(X[trial], y)
            # Strict '>' keeps the earliest column when scores tie.
            if best_score is None or score > best_score:
                best_name, best_score = name, score

        selected.append(best_name)
        remaining.remove(best_name)
        scores.append(best_score)

    return selected, scores
b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc differ