# %load q01_pipeline/build.py
"""Compare classifiers across class-imbalance resampling strategies.

At import time this module loads the bank dataset, label-encodes its
object-typed columns and makes a train/test split.  :func:`pipeline` then
fits each candidate model on the original training data and on three
resampled versions of it, and reports the best ROC AUC on the test split.
"""
import copy

import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

bank = pd.read_csv('data/Bank_data_to_class.csv', sep=',')

# Replace every object-typed column with integer codes.  The single encoder
# is re-fitted for each column, so afterwards it only remembers the classes
# of the last column it processed.
label_enc = LabelEncoder()
for column in bank.select_dtypes(include=['object']).columns.values:
    bank[column] = label_enc.fit_transform(bank[column])

# All columns except the last are features; the last column is the target.
X_train, X_test, y_train, y_test = train_test_split(bank.iloc[:, :-1],
                                                    bank.iloc[:, -1],
                                                    random_state=9)

rf = RandomForestClassifier(random_state=9)
lr = LogisticRegression(random_state=9)
model = [rf, lr]


# Write your solution here :
def _resample(sampler, X, y):
    """Return ``(X, y)`` resampled by *sampler*, whichever method it offers."""
    # Later imbalanced-learn releases name the method ``fit_resample``;
    # earlier ones only provide ``fit_sample``.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample
    return resample(X, y)


def _borderline2_smote(random_state):
    """Build a borderline-2 SMOTE sampler on old or new imbalanced-learn."""
    try:
        return SMOTE(random_state=random_state, kind='borderline2')
    except TypeError:
        # Releases whose SMOTE no longer accepts ``kind`` ship the variant
        # as a separate class with a hyphenated kind name.
        from imblearn.over_sampling import BorderlineSMOTE
        return BorderlineSMOTE(random_state=random_state,
                               kind='borderline-2')


def pipeline(X_train, X_test, y_train, y_test, model):
    """Find the model / resampling combination with the best test ROC AUC.

    Every estimator in *model* is fitted in turn on four training sets: the
    data as given, a random under-sample, a random over-sample and a
    borderline-2 SMOTE over-sample (all samplers use ``random_state=9``).
    After each fit the estimator is scored with ``roc_auc_score`` on its
    hard ``predict`` labels for ``X_test`` (not on probabilities).

    Args:
        X_train: Training features.
        X_test: Test features used for scoring.
        y_train: Training labels.
        y_test: Test labels used for scoring.
        model: Iterable of estimators exposing ``fit`` and ``predict``.
            The estimators are fitted in place.

    Returns:
        A ``(best_model, best_score)`` tuple.  ``best_model`` is a deep copy
        of the winning estimator taken right after the fit that achieved
        ``best_score``, so its fitted state matches the reported score.

    Raises:
        ValueError: If no candidate produced a usable score, for example
            because *model* is empty.
    """
    training_sets = [(X_train, y_train)]
    for sampler in (RandomUnderSampler(random_state=9),
                    RandomOverSampler(random_state=9),
                    _borderline2_smote(random_state=9)):
        training_sets.append(_resample(sampler, X_train, y_train))

    best_model = None
    best_score = 0
    for estimator in model:
        for X_fit, y_fit in training_sets:
            estimator.fit(X_fit, y_fit)
            score = roc_auc_score(y_test, estimator.predict(X_test))
            # ``>=`` means a later candidate that ties the best score
            # replaces the earlier one.
            if score >= best_score:
                # Snapshot now: this same estimator object is refitted on
                # the next training set, which would overwrite its state.
                best_model = copy.deepcopy(estimator)
                best_score = score

    if best_model is None:
        raise ValueError('no model produced a ROC AUC score')
    return best_model, best_score