commit-live-students · sagarpatil232 · Nov 26, 2018 · Nov 26, 2018 · Nov 26, 2018 · Nov 26, 2018
diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_plot_corr/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_plot_corr/__pycache__/build.cpython-36.pyc b/q01_plot_corr/__pycache__/build.cpython-36.pyc
diff --git a/q01_plot_corr/build.py b/q01_plot_corr/build.py
@@ -1,5 +1,7 @@
+# %load q01_plot_corr/build.py
 # Default imports
 import pandas as pd
+import matplotlib.pyplot as plt
 from matplotlib.pyplot import yticks, xticks, subplots, set_cmap
 plt.switch_backend('agg')
 data = pd.read_csv('data/house_prices_multivariate.csv')
@@ -9,8 +11,12 @@
 def plot_corr(data, size=11):
+    """Plot the correlation matrix of ``data`` and return the axes used.
+
+    Args:
+        data: Object exposing ``.corr()``; the result must have ``.columns``.
+        size: Value used for both dimensions of ``figsize``.
+
+    Returns:
+        The axes object the correlation matrix was drawn on with ``matshow``.
+    """
     corr = data.corr()
     fig, ax = subplots(figsize=(size, size))
-    set_cmap("YlOrRd")
+    set_cmap('YlOrRd')
     ax.matshow(corr)
+    # Label one tick per column with the column name; x labels are rotated
+    # by 90 degrees.
     xticks(range(len(corr.columns)), corr.columns, rotation=90)
     yticks(range(len(corr.columns)), corr.columns)
     return ax
+
+
+
+
diff --git a/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc b/q01_plot_corr/tests/__pycache__/test_q01_plot_corr.cpython-36.pyc
diff --git a/q02_best_k_features/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_best_k_features/__pycache__/build.cpython-36.pyc b/q02_best_k_features/__pycache__/build.cpython-36.pyc
diff --git a/q02_best_k_features/build.py b/q02_best_k_features/build.py
@@ -1,3 +1,4 @@
+# %load q02_best_k_features/build.py
 # Default imports
 
 import pandas as pd
@@ -6,7 +7,35 @@
 
 from sklearn.feature_selection import SelectPercentile
 from sklearn.feature_selection import f_regression
-
+from sklearn.feature_selection import SelectKBest, chi2
 
 # Write your solution here:
+def percentile_k_features(df, k=20):
+    """Return the names of the top ``k`` percent of features, best first.
+
+    Every column of ``df`` except the last is treated as a predictor and the
+    last column as the regression target. Predictors are scored with
+    ``f_regression`` and filtered with ``SelectPercentile``.
+
+    Args:
+        df: DataFrame whose last column is the target.
+        k: Percentile of features to keep, forwarded to ``SelectPercentile``.
+
+    Returns:
+        List of the selected column names, ordered by descending score.
+    """
+    X = df.iloc[:, :-1]
+    y = df.iloc[:, -1]
+    selector = SelectPercentile(f_regression, percentile=k).fit(X, y)
+
+    kept = selector.get_support(indices=True)
+    # get_support() reports survivors in column order; rank them by score so
+    # the strongest predictor comes first.
+    ranked = sorted(kept, key=lambda idx: selector.scores_[idx], reverse=True)
+    return [X.columns[idx] for idx in ranked]
+
+
+
+
+
+
+
 
diff --git a/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc b/q02_best_k_features/tests/__pycache__/test_q02_percentile_k_features.cpython-36.pyc
diff --git a/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_rf_rfe/__pycache__/build.cpython-36.pyc b/q03_rf_rfe/__pycache__/build.cpython-36.pyc
diff --git a/q03_rf_rfe/build.py b/q03_rf_rfe/build.py
@@ -1,3 +1,4 @@
+# %load q03_rf_rfe/build.py
 # Default imports
 import pandas as pd
 
@@ -8,4 +9,27 @@
 
 
 # Your solution code here
+def rf_rfe(df):
+    """Select half of the predictors by recursive feature elimination.
+
+    Every column of ``df`` except the last is treated as a predictor and the
+    last column as the target. A ``RandomForestClassifier`` drives ``RFE``,
+    which keeps ``len(predictors) // 2`` features.
+
+    Args:
+        df: DataFrame whose last column is the target.
+
+    Returns:
+        List of the column names RFE ranked 1, in column order.
+    """
+    X = df.iloc[:, :-1]
+    y = df.iloc[:, -1]
+    n_keep = len(X.columns) // 2
+    # NOTE(review): the forest is built without a random_state, so the
+    # selection may differ between runs; confirm whether a seed is wanted.
+    # RFE fits its own copy of the estimator, so no separate fit is needed.
+    rfe = RFE(RandomForestClassifier(), n_features_to_select=n_keep).fit(X, y)
+    # ranking_ == 1 marks the features that survived elimination.
+    return [name for rank, name in zip(rfe.ranking_, X.columns) if rank == 1]
+
+
+
+
+
+
+
 
diff --git a/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc b/q03_rf_rfe/tests/__pycache__/test_q03_rf_rfe.cpython-36.pyc
diff --git a/q04_select_from_model/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/__pycache__/__init__.cpython-36.pyc
diff --git a/q04_select_from_model/__pycache__/build.cpython-36.pyc b/q04_select_from_model/__pycache__/build.cpython-36.pyc
diff --git a/q04_select_from_model/build.py b/q04_select_from_model/build.py
@@ -1,3 +1,4 @@
+# %load q04_select_from_model/build.py
 # Default imports
 from sklearn.feature_selection import SelectFromModel
 from sklearn.ensemble import RandomForestClassifier
@@ -8,3 +9,19 @@
 
 
 # Your solution code here
+def select_from_model(df):
+    """Return the predictors kept by ``SelectFromModel`` on a random forest.
+
+    Every column of ``df`` except the last is treated as a predictor and the
+    last column as the target. NumPy's global random state is seeded with 9
+    before the forest is fitted.
+
+    Args:
+        df: DataFrame whose last column is the target.
+
+    Returns:
+        List of the selected column names, in column order.
+    """
+    np.random.seed(9)
+    # Use the frame passed in rather than a module-level dataset.
+    X = df.iloc[:, :-1]
+    y = df.iloc[:, -1]
+    rf = RandomForestClassifier()
+    rf.fit(X, y)
+    # prefit=True reuses the forest fitted above instead of refitting it.
+    mask = SelectFromModel(rf, prefit=True).get_support()
+    return [name for name, keep in zip(X.columns, mask) if keep]
+
+
+
+
diff --git a/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc b/q04_select_from_model/tests/__pycache__/test_q04_select_from_model.cpython-36.pyc
diff --git a/q05_forward_selected/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/__pycache__/__init__.cpython-36.pyc
diff --git a/q05_forward_selected/__pycache__/build.cpython-36.pyc b/q05_forward_selected/__pycache__/build.cpython-36.pyc
diff --git a/q05_forward_selected/build.py b/q05_forward_selected/build.py
@@ -1,10 +1,102 @@
+# %load q05_forward_selected/build.py
 # Default imports
 import pandas as pd
 from sklearn.linear_model import LinearRegression
-
+from sklearn.metrics import r2_score
+import numpy as np
 data = pd.read_csv('data/house_prices_multivariate.csv')
 
 model = LinearRegression()
 
 
 # Your solution code here
+def forward_selected(data, model):
+    """Greedy forward feature selection scored by R^2 on the fitted data.
+
+    Starting from no features, each round fits ``model`` once per remaining
+    candidate (on the already selected columns plus that candidate) and keeps
+    the candidate with the highest ``r2_score``. Selection stops when the best
+    score no longer improves on the previous round or no candidates remain.
+
+    Args:
+        data: DataFrame containing a ``'SalePrice'`` target column; all other
+            columns are candidate features.
+        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
+
+    Returns:
+        Tuple ``(selected_features, r2_score_features)``: the chosen column
+        names in selection order and the score reached after each addition.
+    """
+    y = data['SalePrice']
+    remaining = list(data.drop('SalePrice', axis=1).columns)
+    selected_features = []
+    r2_score_features = []
+    best_r2 = 0
+    # Looping on the candidate list also covers a frame with no predictor
+    # columns, which would otherwise leave the scores undefined.
+    while remaining:
+        scores = []
+        for feature in remaining:
+            candidate = data[selected_features + [feature]]
+            model.fit(candidate, y)
+            scores.append(r2_score(y, model.predict(candidate)))
+        np_scores = np.array(scores)
+        winner = int(np.argmax(np_scores))
+        new_r2 = np_scores[winner]
+        # Written as "not >" so a NaN score ends the search.
+        if not new_r2 > best_r2:
+            break
+        best_r2 = new_r2
+        selected_features.append(remaining.pop(winner))
+        r2_score_features.append(new_r2)
+    return selected_features, r2_score_features
+# X = data.drop('SalePrice',1)
+# y = data.iloc[:,-1]
+# features = X.columns
+# r2_scores = []
+# for feature in list(features):
+#     df = X.loc[:,[feature]]
+#     model.fit(df,y)
+#     y_pred = model.predict(df) 
+#     r2_scores.append((feature,r2_score(y, y_pred)))
+# max = r2_scores[0][1]
+# max_feature = r2_scores[0][0]
+#  = []
+# while(len(r2_scores_sorted)!=len(r2_scores)):
+#     for item in r2_scores:
+#         if(max < item[1]):
+#             max = item[1]
+#             r2_scores_sorted.append(item)
+# max_feature
+# #data.head()
+# #model.set_params()
+# X = data.iloc[:,:-1]
+# y = data.iloc[:,-1]
+# flag = True
+# #print(X.columns)
+# features = X.columns
+# r2_scores = []
+# print('features')
+# for feature in list(features):
+#     X = pd.DataFrame(X[feature])
+#     model.fit(X,y)
+#     y_pred = model.predict(X) 
+#     r2_scores.append(r2_score(y, y_pred))
+# print(r2_scores)
+# # while(flag==True):
+# #     for feature in features:
+# #         X = X[[feature]]
+# #         model.fit(X,y)
+# #         y_pred = model.predict(X) 
+# #         y_pred = r2_score(y, y_pred)
+# # print(y_pred)
+# X.columns
+# data.head()
+# model.set_params()
+# X = data.iloc[:,:-1]
+# y = data.iloc[:,-1]
+# flag = True
+# features = X.columns
+# r2_scores = 
+# while(flag==True):
+#     for feature in features:
+#         X = X[[feature]]
+#         model.fit(X,y)
+#         y_pred = model.predict(X) 
+#         y_pred = r2_score(y, y_pred)
+# print(y_pred)
+# X[['GrLivArea','GarageArea']]
+
+
+
diff --git a/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc b/q05_forward_selected/tests/__pycache__/test_q05_forward_selected.cpython-36.pyc