From c84547a90c0f521afebc8ea43d754729e3ad8e41 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 16 Sep 2021 22:59:50 +0200 Subject: [PATCH 01/46] OK, at least I can write one pt-rap hist --- train_test_xgboost.py | 123 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 04d3032..5c07364 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -14,7 +14,9 @@ import gc import matplotlib as mpl +import ROOT +from array import array mpl.rc('figure', max_open_warning = 0) @@ -65,6 +67,56 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): self.__train_pred = None self.__test_pred = None + self.root_output_name = 'hists.root' + + self.hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + + self.hist_out.cd() + + ROOT.gDirectory.mkdir('Signal') + ROOT.gDirectory.mkdir('Background') + + self.hist_out.cd() + self.hist_out.cd('Signal') + + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + self.hist_out.cd() + ROOT.gDirectory.cd('Signal/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('roc') + ROOT.gDirectory.mkdir('hists') + + self.hist_out.cd() + ROOT.gDirectory.cd('Signal/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('roc') + ROOT.gDirectory.mkdir('hists') + + + self.hist_out.cd() + self.hist_out.cd('Background') + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + + self.hist_out.cd() + ROOT.gDirectory.cd('Background/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('roc') + ROOT.gDirectory.mkdir('hists') + + self.hist_out.cd() + ROOT.gDirectory.cd('Background/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('roc') + ROOT.gDirectory.mkdir('hists') + + def apply_predictions(self): self.__bst_train= pd.DataFrame(data=self.bst.predict(self.dtrain, output_margin=False), columns=["xgb_preds"]) @@ -273,6 +325,77 @@ def 
pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') + self.hist_out.cd() + ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') + + + rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) + pT_orig = array('d', df_orig['pT'].values.tolist()) + + pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_orig)): + pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) + + ROOT.gStyle.SetOptStat(0) + + + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_before_cut.Draw('COLZ') + + + + + + rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) + pT_cut = array('d', df_cut['pT'].values.tolist()) + + pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_cut)): + pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) + + ROOT.gStyle.SetOptStat(0) + + + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_cut.Draw('COLZ') + + + + + + rapidity_diff = array('d', difference['rapidity'].values.tolist()) + pT_diff = array('d', difference['pT'].values.tolist()) + + pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_diff)): + pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) + + ROOT.gStyle.SetOptStat(0) + + + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_diff.Draw('COLZ') + + + pT_rap_before_cut.Write() + pT_rap_cut.Write() + pT_rap_diff.Write() + + self.hist_out.Close() + + def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): """ From 303acfb3cd317f45f46e32a7be2fe1c073ee1b83 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 16 Sep 2021 
23:16:20 +0200 Subject: [PATCH 02/46] Can write few hists to out file, need to test tomorrow --- train_test_xgboost.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 5c07364..98478dc 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -69,53 +69,55 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): self.root_output_name = 'hists.root' - self.hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - self.hist_out.cd() + hist_out.cd() ROOT.gDirectory.mkdir('Signal') ROOT.gDirectory.mkdir('Background') - self.hist_out.cd() - self.hist_out.cd('Signal') + hist_out.cd() + hist_out.cd('Signal') ROOT.gDirectory.mkdir('train') ROOT.gDirectory.mkdir('test') - self.hist_out.cd() + hist_out.cd() ROOT.gDirectory.cd('Signal/train') ROOT.gDirectory.mkdir('pt_rap') ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') - self.hist_out.cd() + hist_out.cd() ROOT.gDirectory.cd('Signal/test') ROOT.gDirectory.mkdir('pt_rap') ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') - self.hist_out.cd() - self.hist_out.cd('Background') + hist_out.cd() + hist_out.cd('Background') ROOT.gDirectory.mkdir('train') ROOT.gDirectory.mkdir('test') - self.hist_out.cd() + hist_out.cd() ROOT.gDirectory.cd('Background/train') ROOT.gDirectory.mkdir('pt_rap') ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') - self.hist_out.cd() + hist_out.cd() ROOT.gDirectory.cd('Background/test') ROOT.gDirectory.mkdir('pt_rap') ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') + hist_out.Close() + def apply_predictions(self): @@ -325,7 +327,11 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') - self.hist_out.cd() + + + hist_out = 
ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + hist_out.cd() ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') @@ -393,7 +399,7 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da pT_rap_cut.Write() pT_rap_diff.Write() - self.hist_out.Close() + hist_out.Close() From 6f06aaaaac66e23053a174c9905dc0edc134ff75 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 17 Sep 2021 10:01:03 +0200 Subject: [PATCH 03/46] Can plot variables distributions for signal and background, train and test --- train_test_xgboost.py | 88 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 98478dc..9f7f474 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -419,6 +419,10 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe name of pdf document with distributions """ + hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + hist_out.cd() + + for feature in dfs_orig.columns: fig, ax = plt.subplots(3, figsize=(20, 10)) @@ -494,4 +498,88 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe fig.tight_layout() plt.savefig(pdf_key,format='pdf') + + + + + + dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) + dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) + + + dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) + dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) + + + dfs_diff_feat = array('d', difference_s[feature].values.tolist()) + + + dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, + min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) + + for i in range(len(dfs_orig_feat)): + dfs_orig_root.Fill(dfs_orig_feat[i]) + + dfs_orig_root.Draw() + + dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, + 
min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) + + for i in range(len(dfb_orig_feat)): + dfb_orig_root.Fill(dfb_orig_feat[i]) + + dfb_orig_root.Draw() + + + dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, + min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) + + for i in range(len(dfs_cut_feat)): + dfs_cut_root.Fill(dfs_cut_feat[i]) + + dfs_cut_root.Draw() + + + dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, + min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) + + for i in range(len(dfb_cut_feat)): + dfb_cut_root.Fill(dfb_cut_feat[i]) + + dfb_cut_root.Draw() + + + dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, + min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) + + for i in range(len(dfs_diff_feat)): + dfs_diff_root.Fill(dfs_diff_feat[i]) + + dfs_diff_root.Draw() + + + # dfs_orig_root.Write() + # dfb_orig_root.Write() + # + # dfs_cut_root.Write() + # dfb_cut_root.Write() + # + # dfs_diff_root.Write() + + hist_out.cd() + + hist_out.cd('Signal'+'/'+sample+'/'+'hists') + dfs_orig_root.Write() + dfs_cut_root.Write() + dfs_diff_root.Write() + + hist_out.cd() + + hist_out.cd('Background'+'/'+sample+'/'+'hists') + + dfb_orig_root.Write() + dfb_cut_root.Write() + + hist_out.Close() + pdf_key.close() From cef708df786b5f723d64690ba4aa87a1e2857317 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 17 Sep 2021 11:26:10 +0200 Subject: [PATCH 04/46] Plotted roc curve as root object --- helper.py | 10 ++++++- train_test_xgboost.py | 62 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/helper.py b/helper.py index a3dfc99..91abec5 100644 --- a/helper.py +++ b/helper.py @@ -104,7 +104,15 @@ def AMS(y_true, y_predict, y_true1, y_predict1, output_path): fig.tight_layout() 
fig.savefig(str(output_path)+'/hists.png') plt.show() - return S0_best_threshold, S0_best_threshold1 + + roc_curve_data = dict() + roc_curve_data["fpr_train"] = fpr + roc_curve_data["tpr_train"] = tpr + + roc_curve_data["fpr_test"] = fpr1 + roc_curve_data["tpr_test"] = tpr1 + + return S0_best_threshold, S0_best_threshold1, roc_curve_data def plot_confusion_matrix(cm, classes, diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 9f7f474..1857af5 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -87,13 +87,13 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): hist_out.cd() ROOT.gDirectory.cd('Signal/train') ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('roc') + # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.cd() ROOT.gDirectory.cd('Signal/test') ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('roc') + # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') @@ -107,13 +107,13 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): hist_out.cd() ROOT.gDirectory.cd('Background/train') ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('roc') + # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.cd() ROOT.gDirectory.cd('Background/test') ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('roc') + # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.Close() @@ -132,7 +132,51 @@ def apply_predictions(self): def get_threshold(self, train_y, test_y): - self.__train_best_thr, self.__test_best_thr = AMS(train_y, self.__bst_train['xgb_preds'], test_y, self.__bst_test['xgb_preds'], self.output_path) + self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(train_y, + self.__bst_train['xgb_preds'], test_y, self.__bst_test['xgb_preds'], + self.output_path) + + hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + hist_out.cd() + + fpr = roc_curve_data['fpr_train'] + tpr = 
roc_curve_data['tpr_train'] + + + fpr1 = roc_curve_data['fpr_test'] + tpr1 = roc_curve_data['tpr_test'] + + + fpr_d_tr = array('d', fpr.tolist()) + tpr_d_tr = array('d', tpr.tolist()) + + fpr_d_ts = array('d', fpr1.tolist()) + tpr_d_ts = array('d', tpr1.tolist()) + + + train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) + test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) + + train_roc.SetLineColor(ROOT.kRed + 2) + test_roc.SetLineColor(ROOT.kBlue + 2) + + + train_roc.SetLineWidth(3) + test_roc.SetLineWidth(3) + + + train_roc.SetLineStyle(9) + test_roc.SetLineStyle(9) + + train_roc.SetTitle("Receiver operating characteristic") + test_roc.SetTitle("Receiver operating characteristic") + + train_roc.Write() + test_roc.Write() + + hist_out.Close() + return self.__train_best_thr, self.__test_best_thr @@ -558,14 +602,6 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe dfs_diff_root.Draw() - # dfs_orig_root.Write() - # dfb_orig_root.Write() - # - # dfs_cut_root.Write() - # dfb_cut_root.Write() - # - # dfs_diff_root.Write() - hist_out.cd() hist_out.cd('Signal'+'/'+sample+'/'+'hists') From 6e02ce384754173b7a6856f3f53b7ed0f1bbb629 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 17 Sep 2021 11:31:29 +0200 Subject: [PATCH 05/46] Set graph titles --- train_test_xgboost.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 1857af5..0dc7e88 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -169,8 +169,10 @@ def get_threshold(self, train_y, test_y): train_roc.SetLineStyle(9) test_roc.SetLineStyle(9) - train_roc.SetTitle("Receiver operating characteristic") - test_roc.SetTitle("Receiver operating characteristic") + + + train_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic train") train_roc.Write() test_roc.Write() From f4d329317a158510ee7783ea07c938eb0457f442 Mon Sep 17 00:00:00 
2001 From: conformist89 Date: Fri, 17 Sep 2021 11:41:20 +0200 Subject: [PATCH 06/46] Corrected train roc-curve's title --- train_test_xgboost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 0dc7e88..8967009 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -172,7 +172,7 @@ def get_threshold(self, train_y, test_y): train_roc.SetTitle("Receiver operating characteristic train") - test_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic test") train_roc.Write() test_roc.Write() From b4f1162234bfff02c9abca7c8bf315abd13db87c Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 17 Sep 2021 12:49:00 +0200 Subject: [PATCH 07/46] Correlation matrix using hipe4ml (but need to check carefully order) --- MLconfig_variables.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MLconfig_variables.py b/MLconfig_variables.py index bc013b8..4a11d20 100644 --- a/MLconfig_variables.py +++ b/MLconfig_variables.py @@ -6,6 +6,15 @@ from scipy.stats import binned_statistic as b_s import matplotlib as mpl +from hipe4ml import plot_utils + +def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): + res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) + res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') + res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') + + + def calculate_correlation(df, vars_to_corr, target_var) : """ Calculates correlations with target variable variable and standart errors From aae6b0cb2924991b0b9edebc6f49e58a0f4f0d59 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 4 Oct 2021 14:30:05 +0200 Subject: [PATCH 08/46] Changed label's font size --- MLconfig_variables.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/MLconfig_variables.py b/MLconfig_variables.py index 4a11d20..765bf66 100644 --- a/MLconfig_variables.py +++ 
b/MLconfig_variables.py @@ -12,7 +12,7 @@ def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') - + def calculate_correlation(df, vars_to_corr, target_var) : @@ -72,11 +72,12 @@ def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') ax.grid(zorder=0) - ax.set_xticklabels(vars_to_draw, fontsize=25, rotation =70) - ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) + ax.set_xticklabels(vars_to_draw, fontsize=30, rotation =70) + # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) + ax.yaxis.set_tick_params(labelsize=30) plt.legend(('signal','background'), fontsize = 25) - plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 25) - plt.ylabel('Correlation coefficient', fontsize = 25) + plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) + plt.ylabel('Correlation coefficient', fontsize = 30) fig.tight_layout() fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') @@ -155,7 +156,8 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') - + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) fig.tight_layout() plt.savefig(pdf_key,format='pdf') @@ -257,6 +259,11 @@ def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): mpl.pyplot.colorbar() + + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) + + plt.legend(shadow=True,title =str(len(df))+ " samples") fig.tight_layout() From 
6a45ae5edbce22f26b7dda54463e2bb8ef1045f7 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 4 Oct 2021 14:37:12 +0200 Subject: [PATCH 09/46] Set lablel title for output file --- train_test_xgboost.py | 101 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/train_test_xgboost.py b/train_test_xgboost.py index 8967009..74685aa 100644 --- a/train_test_xgboost.py +++ b/train_test_xgboost.py @@ -87,13 +87,11 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): hist_out.cd() ROOT.gDirectory.cd('Signal/train') ROOT.gDirectory.mkdir('pt_rap') - # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.cd() ROOT.gDirectory.cd('Signal/test') ROOT.gDirectory.mkdir('pt_rap') - # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') @@ -107,13 +105,11 @@ def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): hist_out.cd() ROOT.gDirectory.cd('Background/train') ROOT.gDirectory.mkdir('pt_rap') - # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.cd() ROOT.gDirectory.cd('Background/test') ROOT.gDirectory.mkdir('pt_rap') - # ROOT.gDirectory.mkdir('roc') ROOT.gDirectory.mkdir('hists') hist_out.Close() @@ -174,6 +170,12 @@ def get_threshold(self, train_y, test_y): train_roc.SetTitle("Receiver operating characteristic train") test_roc.SetTitle("Receiver operating characteristic test") + train_roc.GetXaxis().SetTitle('FPR'); + train_roc.GetYaxis().SetTitle('TPR'); + + test_roc.GetXaxis().SetTitle('FPR'); + test_roc.GetYaxis().SetTitle('TPR'); + train_roc.Write() test_roc.Write() @@ -291,6 +293,69 @@ def preds_prob(self, preds, true, dataset): fig.savefig(str(self.output_path)+'/test_best_pred.png') + + # def preds_prob1(self, preds,true, preds1, true1): + # fig, ax = plt.subplots(figsize=(12, 8)) + # bins1=100 + # TP = self.__bst_train[(self.__bst_train[true]==1)] + # TN = self.__bst_train[(self.__bst_train[true]==0)] + # + # plt.hist(TN[preds], 
density=True, bins=bins1,facecolor='red',alpha = 0.3, label='background in train') + # plt.hist(TP[preds], density=True, bins=bins1,facecolor='blue',alpha = 0.3, label='signal in train') + # + # + # TP1 = self.__bst_test[(self.__bst_test[true]==1)] + # TN1 = self.__bst_test[(self.__bst_test[true]==0)] + # + # scale = np.histogram(TN[preds], density = True, bins=bins1)[0] / np.histogram(TN[preds], density = False, bins=bins1)[0] + # + # scale1 = np.histogram(TP[preds], density = True, bins=bins1)[0] / np.histogram(TP[preds], density = False, bins=bins1)[0] + # + # hist, bins_train = np.histogram(TN[preds], density = False, bins=bins1) + # err = np.sqrt(hist) + # center = (bins_train[:-1] + bins_train[1:]) / 2 + # plt.errorbar(center, hist * scale, yerr=err*scale, fmt='o', + # c='red', label='signal in test') + # + # # plt.errorbar(center, hist, fmt='o', + # # c='red', label='signal in test') + # + # hist1, bins_test = np.histogram(TP1[preds1], density = False, bins=bins1) + # err1 = np.sqrt(hist1) + # center1 = (bins_test[:-1] + bins_test[1:]) / 2 + # plt.errorbar(center1, hist1 , yerr=err1, fmt='o', + # c='blue', label='background in test') + # + # # plt.errorbar(center, hist,fmt='o', + # # c='blue', label='background in test') + # + # #ax.annotate('cut on probability', xy=(0, 90), xycoords='data',xytext=(0.25,0.5), textcoords='axes fraction', + # #fontsize=15,arrowprops=dict(facecolor='black', shrink=0.05),horizontalalignment='right', verticalalignment='top') + # + # # if df[true].unique().shape[0]>2: + # # TP2= df[df[true]>1] + # # plt.hist(TP2[preds], bins=bins1,facecolor='green',alpha = 0.3, label='secondaries in train') + # # TP2= df1[df1[true1]>1] + # # hist2, bins2 = np.histogram(TP2[preds1], bins=bins1) + # # center2 = (bins2[:-1] + bins2[1:]) / 2 + # # err2 = np.sqrt(hist2) + # # plt.errorbar(center2, hist2,yerr=err2, fmt='o',c='green',label='secondaries in test') + # + # + # ax.set_yscale('log') + # ax.set_xlabel('Probability',fontsize=20) + # + # 
ax.xaxis.set_tick_params(labelsize=15) + # ax.yaxis.set_tick_params(labelsize=15) + # + # plt.ylabel('Counts', fontsize=20) + # #ax.set_xticks(np.arange(0,1.1,0.1)) + # plt.legend(fontsize=18) + # plt.show() + # fig.tight_layout() + # fig.savefig(self.output_path+'/'+'Lambda_XGB_prediction_0.jpg') + + def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) @@ -308,12 +373,13 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da axs[2].set_aspect(aspect = 'auto') rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) diff = df_orig.shape[0] - df_cut.shape[0] axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) - axs[2].legend(shadow=True, title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label + - '\n ' + str(diff)+ ' samples were rejected ', - fontsize =14) + axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) + + counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) @@ -396,6 +462,9 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da ROOT.gStyle.SetPalette(ROOT.kBird) + pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_before_cut.Draw('COLZ') @@ -417,6 +486,9 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da ROOT.gStyle.SetPalette(ROOT.kBird) + pT_rap_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_cut.Draw('COLZ') @@ -438,6 +510,9 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da 
ROOT.gStyle.SetPalette(ROOT.kBird) + pT_rap_diff.GetXaxis().SetTitle('rapidity'); + pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); + pT_rap_diff.Draw('COLZ') @@ -490,7 +565,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe ax[0].xaxis.set_tick_params(labelsize=15) ax[0].yaxis.set_tick_params(labelsize=15) - ax[0].set_title(str(feature) + ' MC '+ sample, fontsize = 25) + ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) ax[0].set_xlabel(feature, fontsize = 25) if feature!=mass_var: @@ -513,7 +588,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe ax[1].xaxis.set_tick_params(labelsize=15) ax[1].yaxis.set_tick_params(labelsize=15) - ax[1].set_title(feature + ' MC '+ sample, fontsize = 25) + ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) ax[1].set_xlabel(feature, fontsize = 25) if feature!='mass': @@ -535,7 +610,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe ax[2].xaxis.set_tick_params(labelsize=15) ax[2].yaxis.set_tick_params(labelsize=15) - ax[2].set_title(feature + ' MC '+ sample, fontsize = 25) + ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) ax[2].set_xlabel(feature, fontsize = 25) if feature!=mass_var: @@ -566,6 +641,8 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe for i in range(len(dfs_orig_feat)): dfs_orig_root.Fill(dfs_orig_feat[i]) + dfs_orig_root.GetXaxis().SetTitle(feature); + dfs_orig_root.Draw() dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, @@ -574,6 +651,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe for i in range(len(dfb_orig_feat)): dfb_orig_root.Fill(dfb_orig_feat[i]) + dfb_orig_root.GetXaxis().SetTitle(feature); dfb_orig_root.Draw() @@ -583,6 +661,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, 
differe for i in range(len(dfs_cut_feat)): dfs_cut_root.Fill(dfs_cut_feat[i]) + dfs_cut_root.GetXaxis().SetTitle(feature); dfs_cut_root.Draw() @@ -592,6 +671,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe for i in range(len(dfb_cut_feat)): dfb_cut_root.Fill(dfb_cut_feat[i]) + dfb_cut_root.GetXaxis().SetTitle(feature); dfb_cut_root.Draw() @@ -601,6 +681,7 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe for i in range(len(dfs_diff_feat)): dfs_diff_root.Fill(dfs_diff_feat[i]) + dfs_diff_root.GetXaxis().SetTitle(feature); dfs_diff_root.Draw() From 896f853c16be40665adbdb632813982ed909e2e5 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 13 Oct 2021 12:27:50 +0200 Subject: [PATCH 10/46] Hipe4ML based bayesian optimization, train, test and model saving --- hipe_conf_params.py | 82 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 hipe_conf_params.py diff --git a/hipe_conf_params.py b/hipe_conf_params.py new file mode 100644 index 0000000..dc78b4d --- /dev/null +++ b/hipe_conf_params.py @@ -0,0 +1,82 @@ +from hipe4ml.model_handler import ModelHandler +from dataclasses import dataclass +import xgboost as xgb + +import matplotlib.pyplot as plt +from hipe4ml import plot_utils + +import xgboost as xgb +import treelite + + +@dataclass +class XGBmodel(): + features_for_train: list + hyper_pars_ranges: dict + train_test_data: list + output_path : str + model_hdl: ModelHandler = (None, None, None) + metrics: str = 'roc_auc' + nfold: int = 3 + init_points: int = 1 + n_iter: int = 2 + n_jobs: int = -1 + + + + + def modelBO(self): + model_clf = xgb.XGBClassifier() + self.model_hdl = ModelHandler(model_clf, self.features_for_train) + self.model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, + self.metrics, self.nfold, self.init_points, self.n_iter, self.n_jobs) + + + + def train_test_pred(self): + 
self.model_hdl.train_test_model(self.train_test_data) + + y_pred_train = self.model_hdl.predict(self.train_test_data[0], False) + y_pred_test = self.model_hdl.predict(self.train_test_data[2], False) + + + return y_pred_train, y_pred_test + + + def save_predictions(self, filename): + print(self.model_hdl.get_original_model()) + self.model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) + + + def load_model(self, filename): + self.model_hdl.load_model_handler(filename) + + + + def save_model_lib(self): + bst = self.model_hdl.model.get_booster() + + #create an object out of your model, bst in our case + model = treelite.Model.from_xgboost(bst) + #use GCC compiler + toolchain = 'gcc' + #parallel_comp can be changed upto as many processors as one have + model.export_lib(toolchain=toolchain, libpath=self.output_path+'/xgb_model.so', + params={'parallel_comp': 4}, verbose=True) + + + # Operating system of the target machine + platform = 'unix' + model.export_srcpkg(platform=platform, toolchain=toolchain, + pkgpath=self.output_path+'/XGBmodel.zip', libname='xgb_model.so', + verbose=True) + + + + def plot_dists(self): + + leg_labels = ['background', 'signal'] + ml_out_fig = plot_utils.plot_output_train_test(self.model_hdl, self.train_test_data, 100, + False, leg_labels, True, density=True) + + plt.savefig(str(self.output_path)+'/thresholds.png') From 9a13d689f47ffb89aae7874cf500aa9602ab12f4 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 15 Oct 2021 15:44:10 +0200 Subject: [PATCH 11/46] Applied train and test predictions, got distributions and estimated model --- apply_model.py | 324 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 apply_model.py diff --git a/apply_model.py b/apply_model.py new file mode 100644 index 0000000..421a401 --- /dev/null +++ b/apply_model.py @@ -0,0 +1,324 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import xgboost as xgb +from 
hipe4ml.plot_utils import plot_roc_train_test +from helper import * +from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score + +from dataclasses import dataclass + +from matplotlib.font_manager import FontProperties + +from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, + AutoMinorLocator) + +import gc +import matplotlib as mpl +import ROOT + +from array import array + + +mpl.rc('figure', max_open_warning = 0) + + +@dataclass +class ApplyXGB: + + y_pred_train : np.ndarray + y_pred_test : np.ndarray + + y_train : np.ndarray + y_test : np.ndarray + + output_path : str + + __bst_train : pd = pd.DataFrame() + __bst_test : pd = pd.DataFrame() + + __train_best_thr : int = 0 + __test_best_thr : int = 0 + + + + # root_output_name : str = 'hists.root' + # + # __hist_out : ROOT.TFile(output_path+'/'+root_output_name, "UPDATE") + + + + + def apply_predictions(self): + + self.__bst_train["xgb_preds"] = self.y_pred_train + self.__bst_train['issignal'] = self.y_train + + + self.__bst_test["xgb_preds"] = self.y_pred_test + self.__bst_test['issignal'] = self.y_test + + return self.__bst_train, self.__bst_test + + + def get_threshold(self): + + self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(self.y_train, + self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], + self.output_path) + + + return self.__train_best_thr, self.__test_best_thr + + + def apply_threshold(self): + cut_train = self.__train_best_thr + self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) + + cut_test = self.__test_best_thr + self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) + + return self.__train_pred, self.__test_pred + + + def get_result(self, x_train, x_test): + + train_with_preds = x_train.copy() + train_with_preds['xgb_preds1'] = self.__train_pred.values + + + test_with_preds = x_test.copy() + test_with_preds['xgb_preds1'] = self.__test_pred.values + + return train_with_preds, test_with_preds + + + def 
features_importance(self, bst): + # this one needs to be tested + ax = xgb.plot_importance(bst) + plt.rcParams['figure.figsize'] = [6, 3] + ax.figure.tight_layout() + ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") + + + def CM_plot_train_test(self): + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true neagtives is C11. + + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance + + Parameters + ---------- + test_best: numpy.float32 + best threshold + + x_train: dataframe + we want to get confusion matrix on training datasets + """ + #lets take the best threshold and look at the confusion matrix + + cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) + np.set_printoptions(precision=2) + fig_train, axs_train = plt.subplots(figsize=(10, 8)) + axs_train.yaxis.set_label_coords(-0.04,.5) + axs_train.xaxis.set_label_coords(0.5,-.005) + plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], + title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') + + + cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) + np.set_printoptions(precision=2) + fig_test, axs_test = plt.subplots(figsize=(10, 8)) + axs_test.yaxis.set_label_coords(-0.04,.5) + axs_test.xaxis.set_label_coords(0.5,-.005) + plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], + title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') + + + + def 
pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): + fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) + + + if sign ==0: + s_label = 'Background' + m = 5 + + if sign==1: + s_label = 'Signal' + m = 1 + + axs[0].set_aspect(aspect = 'auto') + axs[1].set_aspect(aspect = 'auto') + axs[2].set_aspect(aspect = 'auto') + + rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) + diff = df_orig.shape[0] - df_cut.shape[0] + axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) + axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) + axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) + + + + counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) + axs[0].set_xlabel('rapidity', fontsize=15) + axs[0].set_ylabel('pT, GeV', fontsize=15) + + + mpl.pyplot.colorbar(im0, ax = axs[0]) + + + + axs[0].xaxis.set_major_locator(MultipleLocator(1)) + axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[0].xaxis.set_tick_params(which='both', width=2) + + + fig.tight_layout() + + + counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) + axs[1].set_xlabel('rapidity', fontsize=15) + axs[1].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[1]) + + axs[1].xaxis.set_major_locator(MultipleLocator(1)) + axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[1].xaxis.set_tick_params(which='both', width=2) + + 
fig.tight_layout() + + + counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[2].set_title(s_label + ' difference ', fontsize = 16) + axs[2].set_xlabel('rapidity', fontsize=15) + axs[2].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[2]) + + + axs[2].xaxis.set_major_locator(MultipleLocator(1)) + axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[2].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') + + + + def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + ---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + # hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + # hist_out.cd() + + + for feature in dfs_orig.columns: + fig, ax = plt.subplots(3, figsize=(20, 10)) + + + fontP = FontProperties() + fontP.set_size('xx-large') + + ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + + + '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + + '\nquality cuts ', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[0].xaxis.set_tick_params(labelsize=15) + ax[0].yaxis.set_tick_params(labelsize=15) + + 
ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) + ax[0].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[0].set_yscale('log') + + fig.tight_layout() + + + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + + '\nquality cuts + ML cut', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[1].xaxis.set_tick_params(labelsize=15) + ax[1].yaxis.set_tick_params(labelsize=15) + + ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) + ax[1].set_xlabel(feature, fontsize = 25) + + if feature!='mass': + ax[1].set_yscale('log') + + fig.tight_layout() + + + + + ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[2].xaxis.set_tick_params(labelsize=15) + ax[2].yaxis.set_tick_params(labelsize=15) + + ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) + ax[2].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[2].set_yscale('log') + + fig.tight_layout() + + plt.savefig(pdf_key,format='pdf') + pdf_key.close() From 8cdda43ae8ab777811a6ad8d3e27a0634aadbe4b Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 15 Oct 2021 16:11:52 +0200 Subject: [PATCH 12/46] Model is private variable --- hipe_conf_params.py | 24 ++++++++++++------------ 1 file changed, 12 
insertions(+), 12 deletions(-) diff --git a/hipe_conf_params.py b/hipe_conf_params.py index dc78b4d..4f802b0 100644 --- a/hipe_conf_params.py +++ b/hipe_conf_params.py @@ -15,7 +15,7 @@ class XGBmodel(): hyper_pars_ranges: dict train_test_data: list output_path : str - model_hdl: ModelHandler = (None, None, None) + __model_hdl: ModelHandler = (None, None, None) metrics: str = 'roc_auc' nfold: int = 3 init_points: int = 1 @@ -27,34 +27,34 @@ class XGBmodel(): def modelBO(self): model_clf = xgb.XGBClassifier() - self.model_hdl = ModelHandler(model_clf, self.features_for_train) - self.model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, + self.__model_hdl = ModelHandler(model_clf, self.features_for_train) + self.__model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, self.metrics, self.nfold, self.init_points, self.n_iter, self.n_jobs) def train_test_pred(self): - self.model_hdl.train_test_model(self.train_test_data) + self.__model_hdl.train_test_model(self.train_test_data) - y_pred_train = self.model_hdl.predict(self.train_test_data[0], False) - y_pred_test = self.model_hdl.predict(self.train_test_data[2], False) + y_pred_train = self.__model_hdl.predict(self.train_test_data[0], False) + y_pred_test = self.__model_hdl.predict(self.train_test_data[2], False) return y_pred_train, y_pred_test def save_predictions(self, filename): - print(self.model_hdl.get_original_model()) - self.model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) + print(self.__model_hdl.get_original_model()) + self.__model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) def load_model(self, filename): - self.model_hdl.load_model_handler(filename) - + self.__model_hdl.load_model_handler(filename) + # add this part to apply preds def save_model_lib(self): - bst = self.model_hdl.model.get_booster() + bst = self.__model_hdl.model.get_booster() #create an object out of your model, bst in our case model = 
treelite.Model.from_xgboost(bst) @@ -76,7 +76,7 @@ def save_model_lib(self): def plot_dists(self): leg_labels = ['background', 'signal'] - ml_out_fig = plot_utils.plot_output_train_test(self.model_hdl, self.train_test_data, 100, + ml_out_fig = plot_utils.plot_output_train_test(self.__model_hdl, self.train_test_data, 100, False, leg_labels, True, density=True) plt.savefig(str(self.output_path)+'/thresholds.png') From ea16a12bec1f5467e9311b7502ff879f1cf975d5 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 18 Oct 2021 10:19:49 +0200 Subject: [PATCH 13/46] Save model as treelite library --- helper.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/helper.py b/helper.py index 91abec5..1f9e936 100644 --- a/helper.py +++ b/helper.py @@ -8,7 +8,7 @@ from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score from numpy import sqrt, log, argmax import itertools - +import treelite def transform_df_to_log(df, vars, inp_file): @@ -198,3 +198,23 @@ def diff_SB_cut(df, target_label): dfb_cut = df[(df['xgb_preds1']==1) & (df[target_label]==0)] return dfs_cut, dfb_cut + + +# add this part to apply preds +def save_model_lib(bst_model, output_path): + bst = bst_model.get_booster() + + #create an object out of your model, bst in our case + model = treelite.Model.from_xgboost(bst) + #use GCC compiler + toolchain = 'gcc' + #parallel_comp can be changed upto as many processors as one have + model.export_lib(toolchain=toolchain, libpath=output_path+'/xgb_model.so', + params={'parallel_comp': 4}, verbose=True) + + + # Operating system of the target machine + platform = 'unix' + model.export_srcpkg(platform=platform, toolchain=toolchain, + pkgpath=output_path+'/XGBmodel.zip', libname='xgb_model.so', + verbose=True) From 9cc895214ca2695a7797ec0550a7389cff53d85d Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 18 Oct 2021 10:22:15 +0200 Subject: [PATCH 14/46] Added getter to return a model --- hipe_conf_params.py | 22 
++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/hipe_conf_params.py b/hipe_conf_params.py index 4f802b0..3884f03 100644 --- a/hipe_conf_params.py +++ b/hipe_conf_params.py @@ -6,7 +6,6 @@ from hipe4ml import plot_utils import xgboost as xgb -import treelite @dataclass @@ -52,25 +51,8 @@ def load_model(self, filename): self.__model_hdl.load_model_handler(filename) - # add this part to apply preds - def save_model_lib(self): - bst = self.__model_hdl.model.get_booster() - - #create an object out of your model, bst in our case - model = treelite.Model.from_xgboost(bst) - #use GCC compiler - toolchain = 'gcc' - #parallel_comp can be changed upto as many processors as one have - model.export_lib(toolchain=toolchain, libpath=self.output_path+'/xgb_model.so', - params={'parallel_comp': 4}, verbose=True) - - - # Operating system of the target machine - platform = 'unix' - model.export_srcpkg(platform=platform, toolchain=toolchain, - pkgpath=self.output_path+'/XGBmodel.zip', libname='xgb_model.so', - verbose=True) - + def get_mode_booster(self): + return self.__model_hdl.model def plot_dists(self): From c854ccb1ff9762f48bf33892493b9e52aa2e7068 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 18 Oct 2021 16:03:51 +0200 Subject: [PATCH 15/46] Deleted a comment --- helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helper.py b/helper.py index 1f9e936..02da4e0 100644 --- a/helper.py +++ b/helper.py @@ -200,7 +200,7 @@ def diff_SB_cut(df, target_label): return dfs_cut, dfb_cut -# add this part to apply preds + def save_model_lib(bst_model, output_path): bst = bst_model.get_booster() From 424fa9f90deea2cdf4c55fe25e18b20693f40c0c Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 18 Oct 2021 16:07:05 +0200 Subject: [PATCH 16/46] Distributions to root file --- apply_model.py | 246 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 241 insertions(+), 5 deletions(-) diff --git a/apply_model.py 
b/apply_model.py index 421a401..683d564 100644 --- a/apply_model.py +++ b/apply_model.py @@ -34,6 +34,8 @@ class ApplyXGB: output_path : str + root_output_name = 'hists.root' + __bst_train : pd = pd.DataFrame() __bst_test : pd = pd.DataFrame() @@ -41,12 +43,50 @@ class ApplyXGB: __test_best_thr : int = 0 + def __post_init__(self): + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") + __hist_out.cd() + + ROOT.gDirectory.mkdir('Signal') + ROOT.gDirectory.mkdir('Background') + + __hist_out.cd() + __hist_out.cd('Signal') + + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + + __hist_out.cd() + __hist_out.cd('Background') + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + - # root_output_name : str = 'hists.root' - # - # __hist_out : ROOT.TFile(output_path+'/'+root_output_name, "UPDATE") + __hist_out.cd() + ROOT.gDirectory.cd('Background/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + __hist_out.cd() + ROOT.gDirectory.cd('Background/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + __hist_out.Close() def apply_predictions(self): @@ -67,6 +107,53 @@ def get_threshold(self): self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], self.output_path) + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + __hist_out.cd() + + fpr = roc_curve_data['fpr_train'] + tpr = roc_curve_data['tpr_train'] + + + fpr1 = roc_curve_data['fpr_test'] + tpr1 = roc_curve_data['tpr_test'] + + + fpr_d_tr = array('d', fpr.tolist()) + tpr_d_tr = array('d', tpr.tolist()) + + fpr_d_ts = array('d', fpr1.tolist()) + tpr_d_ts = array('d', tpr1.tolist()) + + train_roc = ROOT.TGraph(len(fpr_d_tr), 
fpr_d_tr, tpr_d_tr) + test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) + + train_roc.SetLineColor(ROOT.kRed + 2) + test_roc.SetLineColor(ROOT.kBlue + 2) + + + train_roc.SetLineWidth(3) + test_roc.SetLineWidth(3) + + + train_roc.SetLineStyle(9) + test_roc.SetLineStyle(9) + + + train_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic test") + + train_roc.GetXaxis().SetTitle('FPR'); + train_roc.GetYaxis().SetTitle('TPR'); + + test_roc.GetXaxis().SetTitle('FPR'); + test_roc.GetYaxis().SetTitle('TPR'); + + train_roc.Write() + test_roc.Write() + + __hist_out.Close() + return self.__train_best_thr, self.__test_best_thr @@ -226,6 +313,78 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') + + + rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) + pT_orig = array('d', df_orig['pT'].values.tolist()) + pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_orig)): + pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_before_cut.Draw('COLZ') + + + + + + rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) + pT_cut = array('d', df_cut['pT'].values.tolist()) + + pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_cut)): + pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + 
pT_rap_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_cut.Draw('COLZ') + + + + + + rapidity_diff = array('d', difference['rapidity'].values.tolist()) + pT_diff = array('d', difference['pT'].values.tolist()) + + pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_diff)): + pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_diff.GetXaxis().SetTitle('rapidity'); + pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); + pT_rap_diff.Draw('COLZ') + + + pT_rap_before_cut.Write() + pT_rap_cut.Write() + pT_rap_diff.Write() + + __hist_out.Close() + + + def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): """ Applied quality cuts and created distributions for all the features in pdf @@ -242,8 +401,8 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe name of pdf document with distributions """ - # hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - # hist_out.cd() + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() for feature in dfs_orig.columns: @@ -321,4 +480,81 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe fig.tight_layout() plt.savefig(pdf_key,format='pdf') + + dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) + dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) + + + dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) + dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) + + + dfs_diff_feat = array('d', difference_s[feature].values.tolist()) + + + dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, + min(dfs_orig[feature].values.tolist()), 
max(dfs_orig[feature].values.tolist())) + + for i in range(len(dfs_orig_feat)): + dfs_orig_root.Fill(dfs_orig_feat[i]) + + dfs_orig_root.GetXaxis().SetTitle(feature); + + dfs_orig_root.Draw() + + dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, + min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) + + for i in range(len(dfb_orig_feat)): + dfb_orig_root.Fill(dfb_orig_feat[i]) + + dfb_orig_root.GetXaxis().SetTitle(feature); + dfb_orig_root.Draw() + + + dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, + min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) + + for i in range(len(dfs_cut_feat)): + dfs_cut_root.Fill(dfs_cut_feat[i]) + + dfs_cut_root.GetXaxis().SetTitle(feature); + dfs_cut_root.Draw() + + + dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, + min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) + + for i in range(len(dfb_cut_feat)): + dfb_cut_root.Fill(dfb_cut_feat[i]) + + dfb_cut_root.GetXaxis().SetTitle(feature); + dfb_cut_root.Draw() + + + dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, + min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) + + for i in range(len(dfs_diff_feat)): + dfs_diff_root.Fill(dfs_diff_feat[i]) + + dfs_diff_root.GetXaxis().SetTitle(feature); + dfs_diff_root.Draw() + + + __hist_out.cd() + + __hist_out.cd('Signal'+'/'+sample+'/'+'hists') + dfs_orig_root.Write() + dfs_cut_root.Write() + dfs_diff_root.Write() + + __hist_out.cd() + + __hist_out.cd('Background'+'/'+sample+'/'+'hists') + + dfb_orig_root.Write() + dfb_cut_root.Write() + + __hist_out.Close() pdf_key.close() From 41f74f8015ff6d20a0e8d1379673d6773ae4ba45 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 18 Oct 2021 16:46:55 +0200 Subject: [PATCH 17/46] Deleted files with an old version of 
XGBoost code --- MLconfig_XGboostParameters.py | 58 --- MLconfig_variables.py | 271 ------------- train_test_xgboost.py | 704 ---------------------------------- 3 files changed, 1033 deletions(-) delete mode 100644 MLconfig_XGboostParameters.py delete mode 100644 MLconfig_variables.py delete mode 100644 train_test_xgboost.py diff --git a/MLconfig_XGboostParameters.py b/MLconfig_XGboostParameters.py deleted file mode 100644 index fcf6798..0000000 --- a/MLconfig_XGboostParameters.py +++ /dev/null @@ -1,58 +0,0 @@ -import numpy as np -import pandas as pd -import xgboost as xgb -import matplotlib.pyplot as plt - -from bayes_opt import BayesianOptimization -import gc - - -class XGBoostParams: - - def __init__(self, params, dtrain): - self.params = params - self.dtrain = dtrain - - def bo_tune_xgb(self, max_depth, gamma, alpha, n_estimators ,learning_rate): - - params1 = {'max_depth': int(max_depth), - 'gamma': gamma, - 'alpha':alpha, - 'n_estimators': n_estimators, - 'learning_rate':learning_rate, - 'subsample': 0.8, - 'eta': 0.3, - 'eval_metric': 'auc', 'nthread' : 7} - - cv_result = xgb.cv(params1, self.dtrain, num_boost_round=10, nfold=5) - return cv_result['test-auc-mean'].iloc[-1] - - def get_best_params(self): - """ - Performs Bayesian Optimization and looks for the best parameters - - Parameters: - None - """ - #Invoking the Bayesian Optimizer with the specified parameters to tune - xgb_bo = BayesianOptimization(self.bo_tune_xgb, {'max_depth': (4, 10), - 'gamma': (0, 1), - 'alpha': (2,20), - 'learning_rate':(0,1), - 'n_estimators':(100,500) - }) - #performing Bayesian optimization for 5 iterations with 8 steps of random exploration - # with an #acquisition function of expected improvement - xgb_bo.maximize(n_iter=1, init_points=1) - - max_param = xgb_bo.max['params'] - param1= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], - 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': 
int(round(max_param['n_estimators'],0)), - 'objective': 'reg:logistic'} - gc.collect() - - - #To train the algorithm using the parameters selected by bayesian optimization - #Fit/train on training data - bst = xgb.train(param1, self.dtrain) - return bst diff --git a/MLconfig_variables.py b/MLconfig_variables.py deleted file mode 100644 index 765bf66..0000000 --- a/MLconfig_variables.py +++ /dev/null @@ -1,271 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from scipy.stats import sem -from scipy.stats import binned_statistic as b_s -import matplotlib as mpl - -from hipe4ml import plot_utils - -def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): - res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) - res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') - res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') - - - -def calculate_correlation(df, vars_to_corr, target_var) : - """ - Calculates correlations with target variable variable and standart errors - Parameters - ------------------------------------------------ - df: pandas.DataFrame - imput data - vars_to_corr: list of str - variables that correlate with target value - target_var: str - variable that correlates with another variables mentioned in list - """ - - - mean = df[target_var].mean() - sigma = df[target_var].std() - - correlation = [] - error = [] - - for j in vars_to_corr : - mean_j = df[j].mean() - sigma_j = df[j].std() - - cov = (df[j] - mean_j) * (df[target_var] - mean) / (sigma*sigma_j) - correlation.append(cov.mean()) - error.append(sem(cov)) - - return correlation, error - - -def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path): - """ - Plots correlations - Parameters - ------------------------------------------------ - vars_to_draw: list of str - variables that correlate with target value - var_to_corr: str - variables that correlate with target 
value - corr_signal: list - signal covariance coefficient between variable and target variable - corr_signal_errors: - signal covariance standart error of the mean - corr_bg: list - background covariance coefficient between variable and target variable - corr_bg_errors: - background covariance standart error of the mean - output_path: - path that contains output plot - - """ - - fig, ax = plt.subplots(figsize=(20,10)) - plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') - plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') - ax.grid(zorder=0) - ax.set_xticklabels(vars_to_draw, fontsize=30, rotation =70) - # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) - ax.yaxis.set_tick_params(labelsize=30) - plt.legend(('signal','background'), fontsize = 25) - plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) - plt.ylabel('Correlation coefficient', fontsize = 30) - fig.tight_layout() - fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') - - - -def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): - """ - This function takes the entries of the variables and distributes them in 25 bins. - The function then plots the bin centers of the first variable on the x-axis and - the mean values of the bins of the second variable on the y-axis, along with its bin stds. 
- - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input DataFrame - - variable_xaxis: str - variable to be plotted on x axis (invariant mass) - - x_unit: str - x axis variable units - - variable_yaxis: str - variable to be plotted on y axis - - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - - peak: int - invariant mass peak position - - edge_left: int - left edge of x axis variable - - edge_right: int - left edge of y axis variable - """ - - if sign == 1: - keyword = 'signal' - if sign == 0: - keyword = 'background' - - df = df[(df[variable_xaxis] < edge_right) & (df[variable_xaxis] > edge_left)] - - for var in df.columns: - if var != variable_xaxis: - - fig, axs = plt.subplots(figsize=(20, 15)) - - bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) - bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) - bin_count, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='count',bins= 25) - bin_width = (bin_edges[1] - bin_edges[0]) - bin_centers = bin_edges[1:] - bin_width/2 - - nan_ind = np.where(np.isnan(bin_means)) - bin_centers = np.delete(bin_centers, nan_ind) - bin_means = np.delete(bin_means, nan_ind) - bin_count = np.delete(bin_count, nan_ind) - bin_std = np.delete(bin_std , nan_ind) - - - plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', marker='.',mfc='red', ms=10) - - - - plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ - '('+keyword+')', fontsize=25) - plt.xlabel('Mass', fontsize=25) - plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) - - - plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') - - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) - 
fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - - pdf_key.close() - - -def plot2D_all(df, sample, sgn, pdf_key): - """ - Plots 2D distribution between all the variables - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input dataframe - - sample: str - title of the sample - - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - """ - - for xvar in df.columns: - for yvar in df.columns: - if xvar!=yvar: - fig, axs = plt.subplots(figsize=(15, 10)) - cax = plt.hist2d(df[xvar],df[yvar],range=[[df[xvar].min(), df[xvar].max()], [df[yvar].min(), df[yvar].max()]], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) - - - if sgn==1: - plt.title('Signal candidates ' + sample, fontsize = 25) - - if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) - - - plt.xlabel(xvar, fontsize=25) - plt.ylabel(yvar, fontsize=25) - - - mpl.pyplot.colorbar() - - plt.legend(shadow=True,title =str(len(df))+ " samples") - - fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - pdf_key.close() - - -def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): - """ - Plots 2D distribution between variable and invariant mass - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input dataframe - - sample: str - title of the sample - - - mass_var: str - name of the invariant mass variable - - mass_range: list - mass range to be plotted - - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - - peak: int - invariant mass value - - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - """ - - for var in df.columns: - if var != mass_var: - fig, axs = plt.subplots(figsize=(15, 10)) - cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) - - - if sgn==1: - plt.title('Signal 
candidates ' + sample, fontsize = 25) - - if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) - - - plt.xlabel(mass_var, fontsize=25) - plt.ylabel(var, fontsize=25) - - plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') - - mpl.pyplot.colorbar() - - - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) - - - plt.legend(shadow=True,title =str(len(df))+ " samples") - - fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - pdf_key.close() diff --git a/train_test_xgboost.py b/train_test_xgboost.py deleted file mode 100644 index 74685aa..0000000 --- a/train_test_xgboost.py +++ /dev/null @@ -1,704 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import xgboost as xgb -from hipe4ml.plot_utils import plot_roc_train_test -from helper import * -from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score - - -from matplotlib.font_manager import FontProperties - -from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, - AutoMinorLocator) - -import gc -import matplotlib as mpl -import ROOT - -from array import array - - -mpl.rc('figure', max_open_warning = 0) - - -class TrainTestXGBoost: - """ - Class used for train and test model on parameters found by Bayesian Optimization - Parameters - ------------------------------------------------- - bst: - model to be used - - dtrain: xgboost Matrix - train dataset - - y_train: - train target label - - dtest: xgboost Matrix - test dataset - - y_test: - test target label - - bst_train: - dataframe with predictions - - - """ - - - def __init__(self, bst, dtrain, y_train, dtest, y_test, output_path): - - self.bst = bst - self.dtrain = dtrain - self.y_train = y_train - self.dtest = dtest - self.y_test = y_test - self.output_path = output_path - - self.__bst_train = None - self.__bst_test = None - - self.__train_best_thr = None - self.__test_best_thr = None - - self.__train_pred = None - 
self.__test_pred = None - - self.root_output_name = 'hists.root' - - hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - - - hist_out.cd() - - ROOT.gDirectory.mkdir('Signal') - ROOT.gDirectory.mkdir('Background') - - hist_out.cd() - hist_out.cd('Signal') - - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - hist_out.cd() - ROOT.gDirectory.cd('Signal/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - hist_out.cd() - ROOT.gDirectory.cd('Signal/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - - hist_out.cd() - hist_out.cd('Background') - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - - hist_out.cd() - ROOT.gDirectory.cd('Background/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - hist_out.cd() - ROOT.gDirectory.cd('Background/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - hist_out.Close() - - - def apply_predictions(self): - - self.__bst_train= pd.DataFrame(data=self.bst.predict(self.dtrain, output_margin=False), columns=["xgb_preds"]) - self.__bst_train['issignal']=self.y_train - - - self.__bst_test= pd.DataFrame(data=self.bst.predict(self.dtest, output_margin=False), columns=["xgb_preds"]) - self.__bst_test['issignal']=self.y_test - - return self.__bst_train, self.__bst_test - - - def get_threshold(self, train_y, test_y): - self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(train_y, - self.__bst_train['xgb_preds'], test_y, self.__bst_test['xgb_preds'], - self.output_path) - - hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - - hist_out.cd() - - fpr = roc_curve_data['fpr_train'] - tpr = roc_curve_data['tpr_train'] - - - fpr1 = roc_curve_data['fpr_test'] - tpr1 = roc_curve_data['tpr_test'] - - - fpr_d_tr = array('d', fpr.tolist()) - tpr_d_tr = array('d', tpr.tolist()) - - fpr_d_ts = array('d', fpr1.tolist()) - tpr_d_ts = array('d', 
tpr1.tolist()) - - - train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) - test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) - - train_roc.SetLineColor(ROOT.kRed + 2) - test_roc.SetLineColor(ROOT.kBlue + 2) - - - train_roc.SetLineWidth(3) - test_roc.SetLineWidth(3) - - - train_roc.SetLineStyle(9) - test_roc.SetLineStyle(9) - - - - train_roc.SetTitle("Receiver operating characteristic train") - test_roc.SetTitle("Receiver operating characteristic test") - - train_roc.GetXaxis().SetTitle('FPR'); - train_roc.GetYaxis().SetTitle('TPR'); - - test_roc.GetXaxis().SetTitle('FPR'); - test_roc.GetYaxis().SetTitle('TPR'); - - train_roc.Write() - test_roc.Write() - - hist_out.Close() - - return self.__train_best_thr, self.__test_best_thr - - - - def apply_threshold(self): - cut_train = self.__train_best_thr - self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) - - cut_test = self.__test_best_thr - self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) - - return self.__train_pred, self.__test_pred - - - def get_result(self, x_train, x_test): - - train_with_preds = x_train.copy() - train_with_preds['xgb_preds1'] = self.__train_pred.values - - - test_with_preds = x_test.copy() - test_with_preds['xgb_preds1'] = self.__test_pred.values - - return train_with_preds, test_with_preds - - - - def features_importance(self): - ax = xgb.plot_importance(self.bst) - plt.rcParams['figure.figsize'] = [6, 3] - ax.figure.tight_layout() - ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") - - - - def CM_plot_train_test(self): - """ - Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to - the number of observations known to be in group i and predicted to be in - group j. Thus in binary classification, the count of true positives is C00, - false negatives C01,false positives is C10, and true neagtives is C11. 
- - Confusion matrix is applied to previously unseen by model data, so we can - estimate model's performance - - Parameters - ---------- - test_best: numpy.float32 - best threshold - - x_train: dataframe - we want to get confusion matrix on training datasets - """ - #lets take the best threshold and look at the confusion matrix - - cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_train, axs_train = plt.subplots(figsize=(10, 8)) - axs_train.yaxis.set_label_coords(-0.04,.5) - axs_train.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], - title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') - - - cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_test, axs_test = plt.subplots(figsize=(10, 8)) - axs_test.yaxis.set_label_coords(-0.04,.5) - axs_test.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], - title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') - - - def preds_prob(self, preds, true, dataset): - if dataset =='train': - label1 = 'XGB Predictions on the training data set' - else: - label1 = 'XGB Predictions on the test data set' - fig, ax = plt.subplots(figsize=(12, 8)) - bins1=100 - plt.hist(self.__bst_test[preds], bins=bins1,facecolor='green',alpha = 0.3, label=label1) - - TP = self.__bst_test[(self.__bst_test[true]==1)] - TN = self.__bst_test[(self.__bst_test[true]==0)] - #TP[preds].plot.hist(ax=ax, bins=bins1,facecolor='blue', histtype='stepfilled',alpha = 0.3, label='True Positives/signal in predictions') - hist, bins = 
np.histogram(TP[preds], bins=bins1) - err = np.sqrt(hist) - center = (bins[:-1] + bins[1:]) / 2 - - - hist1, bins1 = np.histogram(TN[preds], bins=bins1) - err1 = np.sqrt(hist1) - plt.errorbar(center, hist1, yerr=err1, fmt='o', - c='Red', label='Background in predictions') - - plt.errorbar(center, hist, yerr=err, fmt='o', - c='blue', label='Signal in predictions') - - ax.set_yscale('log') - plt.xlabel('Probability',fontsize=18) - plt.ylabel('Counts', fontsize=18) - plt.legend(fontsize=18) - ax.set_xticks(np.arange(0,1.1,0.1)) - ax.tick_params(axis='both', which='major', labelsize=18) - ax.tick_params(axis='both', which='minor', labelsize=16) - plt.show() - fig.tight_layout() - fig.savefig(str(self.output_path)+'/test_best_pred.png') - - - - # def preds_prob1(self, preds,true, preds1, true1): - # fig, ax = plt.subplots(figsize=(12, 8)) - # bins1=100 - # TP = self.__bst_train[(self.__bst_train[true]==1)] - # TN = self.__bst_train[(self.__bst_train[true]==0)] - # - # plt.hist(TN[preds], density=True, bins=bins1,facecolor='red',alpha = 0.3, label='background in train') - # plt.hist(TP[preds], density=True, bins=bins1,facecolor='blue',alpha = 0.3, label='signal in train') - # - # - # TP1 = self.__bst_test[(self.__bst_test[true]==1)] - # TN1 = self.__bst_test[(self.__bst_test[true]==0)] - # - # scale = np.histogram(TN[preds], density = True, bins=bins1)[0] / np.histogram(TN[preds], density = False, bins=bins1)[0] - # - # scale1 = np.histogram(TP[preds], density = True, bins=bins1)[0] / np.histogram(TP[preds], density = False, bins=bins1)[0] - # - # hist, bins_train = np.histogram(TN[preds], density = False, bins=bins1) - # err = np.sqrt(hist) - # center = (bins_train[:-1] + bins_train[1:]) / 2 - # plt.errorbar(center, hist * scale, yerr=err*scale, fmt='o', - # c='red', label='signal in test') - # - # # plt.errorbar(center, hist, fmt='o', - # # c='red', label='signal in test') - # - # hist1, bins_test = np.histogram(TP1[preds1], density = False, bins=bins1) - # err1 = 
np.sqrt(hist1) - # center1 = (bins_test[:-1] + bins_test[1:]) / 2 - # plt.errorbar(center1, hist1 , yerr=err1, fmt='o', - # c='blue', label='background in test') - # - # # plt.errorbar(center, hist,fmt='o', - # # c='blue', label='background in test') - # - # #ax.annotate('cut on probability', xy=(0, 90), xycoords='data',xytext=(0.25,0.5), textcoords='axes fraction', - # #fontsize=15,arrowprops=dict(facecolor='black', shrink=0.05),horizontalalignment='right', verticalalignment='top') - # - # # if df[true].unique().shape[0]>2: - # # TP2= df[df[true]>1] - # # plt.hist(TP2[preds], bins=bins1,facecolor='green',alpha = 0.3, label='secondaries in train') - # # TP2= df1[df1[true1]>1] - # # hist2, bins2 = np.histogram(TP2[preds1], bins=bins1) - # # center2 = (bins2[:-1] + bins2[1:]) / 2 - # # err2 = np.sqrt(hist2) - # # plt.errorbar(center2, hist2,yerr=err2, fmt='o',c='green',label='secondaries in test') - # - # - # ax.set_yscale('log') - # ax.set_xlabel('Probability',fontsize=20) - # - # ax.xaxis.set_tick_params(labelsize=15) - # ax.yaxis.set_tick_params(labelsize=15) - # - # plt.ylabel('Counts', fontsize=20) - # #ax.set_xticks(np.arange(0,1.1,0.1)) - # plt.legend(fontsize=18) - # plt.show() - # fig.tight_layout() - # fig.savefig(self.output_path+'/'+'Lambda_XGB_prediction_0.jpg') - - - def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): - fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) - - - if sign ==0: - s_label = 'Background' - m = 5 - - if sign==1: - s_label = 'Signal' - m = 1 - - axs[0].set_aspect(aspect = 'auto') - axs[1].set_aspect(aspect = 'auto') - axs[2].set_aspect(aspect = 'auto') - - rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) - saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) - diff = df_orig.shape[0] - df_cut.shape[0] - axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) - axs[1].legend(shadow=True, title =str(len(df_cut))+' 
samples', fontsize =14) - axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) - - - - counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) - axs[0].set_xlabel('rapidity', fontsize=15) - axs[0].set_ylabel('pT, GeV', fontsize=15) - - - mpl.pyplot.colorbar(im0, ax = axs[0]) - - - - axs[0].xaxis.set_major_locator(MultipleLocator(1)) - axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[0].xaxis.set_tick_params(which='both', width=2) - - - fig.tight_layout() - - - counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) - axs[1].set_xlabel('rapidity', fontsize=15) - axs[1].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[1]) - - axs[1].xaxis.set_major_locator(MultipleLocator(1)) - axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[1].xaxis.set_tick_params(which='both', width=2) - - fig.tight_layout() - - - counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[2].set_title(s_label + ' difference ', fontsize = 16) - axs[2].set_xlabel('rapidity', fontsize=15) - axs[2].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[2]) - - - axs[2].xaxis.set_major_locator(MultipleLocator(1)) - axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[2].xaxis.set_tick_params(which='both', width=2) - - fig.tight_layout() - - fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') - - - - - hist_out = 
ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - - hist_out.cd() - ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') - - - rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) - pT_orig = array('d', df_orig['pT'].values.tolist()) - - pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_orig)): - pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) - - ROOT.gStyle.SetOptStat(0) - - - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); - - pT_rap_before_cut.Draw('COLZ') - - - - - - rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) - pT_cut = array('d', df_cut['pT'].values.tolist()) - - pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_cut)): - pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) - - ROOT.gStyle.SetOptStat(0) - - - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); - - pT_rap_cut.Draw('COLZ') - - - - - - rapidity_diff = array('d', difference['rapidity'].values.tolist()) - pT_diff = array('d', difference['pT'].values.tolist()) - - pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_diff)): - pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) - - ROOT.gStyle.SetOptStat(0) - - - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_diff.GetXaxis().SetTitle('rapidity'); - pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); - - pT_rap_diff.Draw('COLZ') - - - pT_rap_before_cut.Write() - pT_rap_cut.Write() - pT_rap_diff.Write() - - hist_out.Close() - - - - def hist_variables(self, 
mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): - """ - Applied quality cuts and created distributions for all the features in pdf - file - Parameters - ---------- - df_s: dataframe - signal - df_b: dataframe - background - feature: str - name of the feature to be plotted - pdf_key: PdfPages object - name of pdf document with distributions - """ - - hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - hist_out.cd() - - - for feature in dfs_orig.columns: - fig, ax = plt.subplots(3, figsize=(20, 10)) - - - fontP = FontProperties() - fontP.set_size('xx-large') - - ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + - - '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + - '\nquality cuts ', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[0].xaxis.set_tick_params(labelsize=15) - ax[0].yaxis.set_tick_params(labelsize=15) - - ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) - ax[0].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[0].set_yscale('log') - - fig.tight_layout() - - - ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + - '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + - '\nquality cuts + ML cut', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[1].set_xlim(dfb_orig[feature].min(), 
dfb_orig[feature].max()) - - ax[1].xaxis.set_tick_params(labelsize=15) - ax[1].yaxis.set_tick_params(labelsize=15) - - ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) - ax[1].set_xlabel(feature, fontsize = 25) - - if feature!='mass': - ax[1].set_yscale('log') - - fig.tight_layout() - - - - - ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[2].xaxis.set_tick_params(labelsize=15) - ax[2].yaxis.set_tick_params(labelsize=15) - - ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) - ax[2].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[2].set_yscale('log') - - fig.tight_layout() - - plt.savefig(pdf_key,format='pdf') - - - - - - dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) - dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) - - - dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) - dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) - - - dfs_diff_feat = array('d', difference_s[feature].values.tolist()) - - - dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, - min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) - - for i in range(len(dfs_orig_feat)): - dfs_orig_root.Fill(dfs_orig_feat[i]) - - dfs_orig_root.GetXaxis().SetTitle(feature); - - dfs_orig_root.Draw() - - dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, - min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) - - for i in range(len(dfb_orig_feat)): - dfb_orig_root.Fill(dfb_orig_feat[i]) - - dfb_orig_root.GetXaxis().SetTitle(feature); - 
dfb_orig_root.Draw() - - - dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, - min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) - - for i in range(len(dfs_cut_feat)): - dfs_cut_root.Fill(dfs_cut_feat[i]) - - dfs_cut_root.GetXaxis().SetTitle(feature); - dfs_cut_root.Draw() - - - dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, - min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) - - for i in range(len(dfb_cut_feat)): - dfb_cut_root.Fill(dfb_cut_feat[i]) - - dfb_cut_root.GetXaxis().SetTitle(feature); - dfb_cut_root.Draw() - - - dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, - min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) - - for i in range(len(dfs_diff_feat)): - dfs_diff_root.Fill(dfs_diff_feat[i]) - - dfs_diff_root.GetXaxis().SetTitle(feature); - dfs_diff_root.Draw() - - - hist_out.cd() - - hist_out.cd('Signal'+'/'+sample+'/'+'hists') - dfs_orig_root.Write() - dfs_cut_root.Write() - dfs_diff_root.Write() - - hist_out.cd() - - hist_out.cd('Background'+'/'+sample+'/'+'hists') - - dfb_orig_root.Write() - dfb_cut_root.Write() - - hist_out.Close() - - pdf_key.close() From d30b1eab4d950f5ace79e371ee6506725bfcfc8b Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 21 Oct 2021 11:52:30 +0200 Subject: [PATCH 18/46] Quality insurance added --- MLconfig_variables.py | 251 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 MLconfig_variables.py diff --git a/MLconfig_variables.py b/MLconfig_variables.py new file mode 100644 index 0000000..9221722 --- /dev/null +++ b/MLconfig_variables.py @@ -0,0 +1,251 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from scipy.stats import sem +from scipy.stats import binned_statistic as b_s +import matplotlib as mpl + +from hipe4ml import 
plot_utils + +def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): + res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) + res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') + res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') + + + +def calculate_correlation(df, vars_to_corr, target_var) : + """ + Calculates correlations with target variable variable and standart errors + Parameters + ------------------------------------------------ + df: pandas.DataFrame + imput data + vars_to_corr: list of str + variables that correlate with target value + target_var: str + variable that correlates with another variables mentioned in list + """ + + + mean = df[target_var].mean() + sigma = df[target_var].std() + + correlation = [] + error = [] + + for j in vars_to_corr : + mean_j = df[j].mean() + sigma_j = df[j].std() + + cov = (df[j] - mean_j) * (df[target_var] - mean) / (sigma*sigma_j) + correlation.append(cov.mean()) + error.append(sem(cov)) + + return correlation, error + + +def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path): + """ + Plots correlations + Parameters + ------------------------------------------------ + vars_to_draw: list of str + variables that correlate with target value + var_to_corr: str + variables that correlate with target value + corr_signal: list + signal covariance coefficient between variable and target variable + corr_signal_errors: + signal covariance standart error of the mean + corr_bg: list + background covariance coefficient between variable and target variable + corr_bg_errors: + background covariance standart error of the mean + output_path: + path that contains output plot + """ + + fig, ax = plt.subplots(figsize=(20,10)) + plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') + plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') + ax.grid(zorder=0) + ax.set_xticklabels(vars_to_draw, 
fontsize=30, rotation =70) + # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) + ax.yaxis.set_tick_params(labelsize=30) + plt.legend(('signal','background'), fontsize = 25) + plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) + plt.ylabel('Correlation coefficient', fontsize = 30) + fig.tight_layout() + fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') + + + +def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): + """ + This function takes the entries of the variables and distributes them in 25 bins. + The function then plots the bin centers of the first variable on the x-axis and + the mean values of the bins of the second variable on the y-axis, along with its bin stds. + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input DataFrame + variable_xaxis: str + variable to be plotted on x axis (invariant mass) + x_unit: str + x axis variable units + variable_yaxis: str + variable to be plotted on y axis + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + peak: int + invariant mass peak position + edge_left: int + left edge of x axis variable + edge_right: int + left edge of y axis variable + """ + + if sign == 1: + keyword = 'signal' + if sign == 0: + keyword = 'background' + + df = df[(df[variable_xaxis] < edge_right) & (df[variable_xaxis] > edge_left)] + + for var in df.columns: + if var != variable_xaxis: + + fig, axs = plt.subplots(figsize=(20, 15)) + + bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) + bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) + bin_count, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='count',bins= 25) + bin_width = (bin_edges[1] - bin_edges[0]) + bin_centers = bin_edges[1:] - bin_width/2 + + nan_ind = 
np.where(np.isnan(bin_means)) + bin_centers = np.delete(bin_centers, nan_ind) + bin_means = np.delete(bin_means, nan_ind) + bin_count = np.delete(bin_count, nan_ind) + bin_std = np.delete(bin_std , nan_ind) + + + plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', marker='.',mfc='red', ms=10) + + + + plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ + '('+keyword+')', fontsize=25) + plt.xlabel('Mass', fontsize=25) + plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) + + + plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') + + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) + fig.tight_layout() + plt.savefig(pdf_key,format='pdf') + + pdf_key.close() + + +def plot2D_all(df, sample, sgn, pdf_key): + """ + Plots 2D distribution between all the variables + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input dataframe + sample: str + title of the sample + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + """ + + for xvar in df.columns: + for yvar in df.columns: + if xvar!=yvar: + fig, axs = plt.subplots(figsize=(15, 10)) + cax = plt.hist2d(df[xvar],df[yvar],range=[[df[xvar].min(), df[xvar].max()], [df[yvar].min(), df[yvar].max()]], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) + + + if sgn==1: + plt.title('Signal candidates ' + sample, fontsize = 25) + + if sgn==0: + plt.title('Background candidates ' + sample, fontsize = 25) + + + plt.xlabel(xvar, fontsize=25) + plt.ylabel(yvar, fontsize=25) + + + mpl.pyplot.colorbar() + + plt.legend(shadow=True,title =str(len(df))+ " samples") + + fig.tight_layout() + plt.savefig(pdf_key,format='pdf') + pdf_key.close() + + +def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): + """ + Plots 2D 
distribution between variable and invariant mass + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input dataframe + sample: str + title of the sample + mass_var: str + name of the invariant mass variable + mass_range: list + mass range to be plotted + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + peak: int + invariant mass value + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + """ + + for var in df.columns: + if var != mass_var: + fig, axs = plt.subplots(figsize=(15, 10)) + cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) + + + if sgn==1: + plt.title('Signal candidates ' + sample, fontsize = 25) + + if sgn==0: + plt.title('Background candidates ' + sample, fontsize = 25) + + + plt.xlabel(mass_var, fontsize=25) + plt.ylabel(var, fontsize=25) + + plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') + + mpl.pyplot.colorbar() + + + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) + + + plt.legend(shadow=True,title =str(len(df))+ " samples") + + fig.tight_layout() + plt.savefig(pdf_key,format='pdf') + pdf_key.close() From be89cc7802a73c0bab8320a14d6c887a628b1f88 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 21 Oct 2021 11:56:34 +0200 Subject: [PATCH 19/46] Added names of roc plots --- apply_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apply_model.py b/apply_model.py index 683d564..3984c20 100644 --- a/apply_model.py +++ b/apply_model.py @@ -149,8 +149,8 @@ def get_threshold(self): test_roc.GetXaxis().SetTitle('FPR'); test_roc.GetYaxis().SetTitle('TPR'); - train_roc.Write() - test_roc.Write() + train_roc.Write("Train_roc") + test_roc.Write("Test_roc") __hist_out.Close() From 169528270b4ff8e85c0716aaa265cc02b7b6252a Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 20:48:05 
+0300 Subject: [PATCH 20/46] Reading information from config files --- config_reader.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ read_configs.py | 22 -------------------- 2 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 config_reader.py delete mode 100644 read_configs.py diff --git a/config_reader.py b/config_reader.py new file mode 100644 index 0000000..a98e7df --- /dev/null +++ b/config_reader.py @@ -0,0 +1,53 @@ +from hipe4ml.tree_handler import TreeHandler +import tomli +import sys + + +def convertDF(input_file, mass_var): + """ + Opens input file in toml format, retrives signal, background and deploy data + like TreeHandler objects + + Parameters + ------------------------------------------------ + df: str + input toml file + """ + with open(str(input_file), encoding="utf-8") as inp_file: + inp_info = tomli.load(inp_file) + + signal = TreeHandler(inp_info["signal"]["path"], inp_info["signal"]["tree"]) + background = TreeHandler(inp_info["background"]["path"], inp_info["background"]["tree"]) + + + + + + bgr_left_edge = inp_info["peak_range"]["bgr_left_edge"] + bgr_right_edge = inp_info["peak_range"]["bgr_right_edge"] + + peak_left_edge = inp_info["peak_range"]["sgn_left_edge"] + peak_right_edge = inp_info["peak_range"]["sgn_right_edge"] + + + selection = str(bgr_left_edge)+'< '+mass_var+' <'+str(peak_left_edge)+' or '+str(peak_right_edge)+\ + '< '+mass_var+' <'+str(bgr_right_edge) + + signalH = signal.get_subset(size = inp_info["number_of_events"]["number_of_signal_events"]) + bkgH = background.get_subset(selection, size=inp_info["number_of_events"]["number_of_background_events"]) + + return signalH, bkgH + + +def read_log_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = tomli.load(inp_file) + + return inp_dict['log_scale']['variables'], inp_dict['log_scale']['variables'] + + + +def read_train_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = 
tomli.load(inp_file) + return inp_dict['train_vars'] diff --git a/read_configs.py b/read_configs.py deleted file mode 100644 index 96de294..0000000 --- a/read_configs.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -import pandas as pd -import tomli - - -def read_log_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - - return inp_dict['non_log_variables'], inp_dict['log_variables'] - - -def read_peak(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - return inp_dict - - -def read_train_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - return inp_dict['train_vars'] From 4ac5999abcfc8627f69ea5ca0c0b660f07a004bc Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:07:33 +0300 Subject: [PATCH 21/46] Changed config reader module's name --- helper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/helper.py b/helper.py index 02da4e0..182c8b6 100644 --- a/helper.py +++ b/helper.py @@ -2,7 +2,7 @@ import pandas as pd import matplotlib.pyplot as plt -from read_configs import * +from config_reader import * import xgboost as xgb from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score @@ -11,6 +11,7 @@ import treelite + def transform_df_to_log(df, vars, inp_file): """ Transforms DataFrame to DataFrame with features in log scale From 48e71230ee00694d9086cacd2ca5c301950fcd73 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:28:36 +0300 Subject: [PATCH 22/46] Put files to installation folder --- .../MLconfig_variables.py | 0 apply_model.py => CandidatesClassifier0/apply_model.py | 0 config_reader.py => CandidatesClassifier0/config_reader.py | 0 helper.py => CandidatesClassifier0/helper.py | 0 hipe_conf_params.py => CandidatesClassifier0/hipe_conf_params.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename 
MLconfig_variables.py => CandidatesClassifier0/MLconfig_variables.py (100%) rename apply_model.py => CandidatesClassifier0/apply_model.py (100%) rename config_reader.py => CandidatesClassifier0/config_reader.py (100%) rename helper.py => CandidatesClassifier0/helper.py (100%) rename hipe_conf_params.py => CandidatesClassifier0/hipe_conf_params.py (100%) diff --git a/MLconfig_variables.py b/CandidatesClassifier0/MLconfig_variables.py similarity index 100% rename from MLconfig_variables.py rename to CandidatesClassifier0/MLconfig_variables.py diff --git a/apply_model.py b/CandidatesClassifier0/apply_model.py similarity index 100% rename from apply_model.py rename to CandidatesClassifier0/apply_model.py diff --git a/config_reader.py b/CandidatesClassifier0/config_reader.py similarity index 100% rename from config_reader.py rename to CandidatesClassifier0/config_reader.py diff --git a/helper.py b/CandidatesClassifier0/helper.py similarity index 100% rename from helper.py rename to CandidatesClassifier0/helper.py diff --git a/hipe_conf_params.py b/CandidatesClassifier0/hipe_conf_params.py similarity index 100% rename from hipe_conf_params.py rename to CandidatesClassifier0/hipe_conf_params.py From c5770cf923091736019b62998ef3b2acbda9a403 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:29:01 +0300 Subject: [PATCH 23/46] Setip tools added --- setup.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..93a33d4 --- /dev/null +++ b/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup, find_packages + + +setup( + name="CandidatesClassifier", + version="0.1.0", + author="olha lavoryk", + license="MIT", + url="https://github.com/conformist89/CandidatesClassifier.git", + packages=find_packages() +) From 1905959d6fe9da02eb4d14660b416f2137b685ec Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:29:27 +0300 Subject: [PATCH 24/46] MIT licwnse 
added --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7995e55 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 CandidatesClassifier + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From c650800e4deec190dae20faf57c3360a4cda62b2 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:33:55 +0300 Subject: [PATCH 25/46] Added files --- cand_class/MLconfig_variables.py | 251 ++++++++++++++ cand_class/apply_model.py | 560 +++++++++++++++++++++++++++++++ cand_class/config_reader.py | 53 +++ cand_class/helper.py | 221 ++++++++++++ cand_class/hipe_conf_params.py | 64 ++++ 5 files changed, 1149 insertions(+) create mode 100644 cand_class/MLconfig_variables.py create mode 100644 cand_class/apply_model.py create mode 100644 cand_class/config_reader.py create mode 100644 cand_class/helper.py create mode 100644 cand_class/hipe_conf_params.py diff --git a/cand_class/MLconfig_variables.py b/cand_class/MLconfig_variables.py new file mode 100644 index 0000000..9221722 --- /dev/null +++ b/cand_class/MLconfig_variables.py @@ -0,0 +1,251 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from scipy.stats import sem +from scipy.stats import binned_statistic as b_s +import matplotlib as mpl + +from hipe4ml import plot_utils + +def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): + res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) + res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') + res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') + + + +def calculate_correlation(df, vars_to_corr, target_var) : + """ + Calculates correlations with target variable variable and standart errors + Parameters + ------------------------------------------------ + df: pandas.DataFrame + imput data + vars_to_corr: list of str + variables that correlate with target value + target_var: str + variable that correlates with another variables mentioned in list + """ + + + mean = df[target_var].mean() + sigma = df[target_var].std() + + correlation = [] + error = [] + + for j in vars_to_corr : + mean_j = df[j].mean() + sigma_j = df[j].std() + + cov = (df[j] - mean_j) * (df[target_var] - mean) / 
(sigma*sigma_j) + correlation.append(cov.mean()) + error.append(sem(cov)) + + return correlation, error + + +def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path): + """ + Plots correlations + Parameters + ------------------------------------------------ + vars_to_draw: list of str + variables that correlate with target value + var_to_corr: str + variables that correlate with target value + corr_signal: list + signal covariance coefficient between variable and target variable + corr_signal_errors: + signal covariance standart error of the mean + corr_bg: list + background covariance coefficient between variable and target variable + corr_bg_errors: + background covariance standart error of the mean + output_path: + path that contains output plot + """ + + fig, ax = plt.subplots(figsize=(20,10)) + plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') + plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') + ax.grid(zorder=0) + ax.set_xticklabels(vars_to_draw, fontsize=30, rotation =70) + # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) + ax.yaxis.set_tick_params(labelsize=30) + plt.legend(('signal','background'), fontsize = 25) + plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) + plt.ylabel('Correlation coefficient', fontsize = 30) + fig.tight_layout() + fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') + + + +def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): + """ + This function takes the entries of the variables and distributes them in 25 bins. + The function then plots the bin centers of the first variable on the x-axis and + the mean values of the bins of the second variable on the y-axis, along with its bin stds. 
+ Parameters + ------------------------------------------------ + df: pandas.DataFrame + input DataFrame + variable_xaxis: str + variable to be plotted on x axis (invariant mass) + x_unit: str + x axis variable units + variable_yaxis: str + variable to be plotted on y axis + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + peak: int + invariant mass peak position + edge_left: int + left edge of x axis variable + edge_right: int + left edge of y axis variable + """ + + if sign == 1: + keyword = 'signal' + if sign == 0: + keyword = 'background' + + df = df[(df[variable_xaxis] < edge_right) & (df[variable_xaxis] > edge_left)] + + for var in df.columns: + if var != variable_xaxis: + + fig, axs = plt.subplots(figsize=(20, 15)) + + bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) + bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) + bin_count, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='count',bins= 25) + bin_width = (bin_edges[1] - bin_edges[0]) + bin_centers = bin_edges[1:] - bin_width/2 + + nan_ind = np.where(np.isnan(bin_means)) + bin_centers = np.delete(bin_centers, nan_ind) + bin_means = np.delete(bin_means, nan_ind) + bin_count = np.delete(bin_count, nan_ind) + bin_std = np.delete(bin_std , nan_ind) + + + plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', marker='.',mfc='red', ms=10) + + + + plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ + '('+keyword+')', fontsize=25) + plt.xlabel('Mass', fontsize=25) + plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) + + + plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') + + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) + fig.tight_layout() + 
plt.savefig(pdf_key,format='pdf') + + pdf_key.close() + + +def plot2D_all(df, sample, sgn, pdf_key): + """ + Plots 2D distribution between all the variables + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input dataframe + sample: str + title of the sample + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + """ + + for xvar in df.columns: + for yvar in df.columns: + if xvar!=yvar: + fig, axs = plt.subplots(figsize=(15, 10)) + cax = plt.hist2d(df[xvar],df[yvar],range=[[df[xvar].min(), df[xvar].max()], [df[yvar].min(), df[yvar].max()]], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) + + + if sgn==1: + plt.title('Signal candidates ' + sample, fontsize = 25) + + if sgn==0: + plt.title('Background candidates ' + sample, fontsize = 25) + + + plt.xlabel(xvar, fontsize=25) + plt.ylabel(yvar, fontsize=25) + + + mpl.pyplot.colorbar() + + plt.legend(shadow=True,title =str(len(df))+ " samples") + + fig.tight_layout() + plt.savefig(pdf_key,format='pdf') + pdf_key.close() + + +def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): + """ + Plots 2D distribution between variable and invariant mass + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input dataframe + sample: str + title of the sample + mass_var: str + name of the invariant mass variable + mass_range: list + mass range to be plotted + sgn: int(0 or 1) + signal definition(0 background, 1 signal) + peak: int + invariant mass value + pdf_key: matplotlib.backends.backend_pdf.PdfPages + output pdf file + """ + + for var in df.columns: + if var != mass_var: + fig, axs = plt.subplots(figsize=(15, 10)) + cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) + + + if sgn==1: + plt.title('Signal candidates ' + sample, fontsize = 25) + + if 
sgn==0: + plt.title('Background candidates ' + sample, fontsize = 25) + + + plt.xlabel(mass_var, fontsize=25) + plt.ylabel(var, fontsize=25) + + plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') + + mpl.pyplot.colorbar() + + + axs.xaxis.set_tick_params(labelsize=20) + axs.yaxis.set_tick_params(labelsize=20) + + + plt.legend(shadow=True,title =str(len(df))+ " samples") + + fig.tight_layout() + plt.savefig(pdf_key,format='pdf') + pdf_key.close() diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py new file mode 100644 index 0000000..3984c20 --- /dev/null +++ b/cand_class/apply_model.py @@ -0,0 +1,560 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import xgboost as xgb +from hipe4ml.plot_utils import plot_roc_train_test +from helper import * +from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score + +from dataclasses import dataclass + +from matplotlib.font_manager import FontProperties + +from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, + AutoMinorLocator) + +import gc +import matplotlib as mpl +import ROOT + +from array import array + + +mpl.rc('figure', max_open_warning = 0) + + +@dataclass +class ApplyXGB: + + y_pred_train : np.ndarray + y_pred_test : np.ndarray + + y_train : np.ndarray + y_test : np.ndarray + + output_path : str + + root_output_name = 'hists.root' + + __bst_train : pd = pd.DataFrame() + __bst_test : pd = pd.DataFrame() + + __train_best_thr : int = 0 + __test_best_thr : int = 0 + + + def __post_init__(self): + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") + __hist_out.cd() + + ROOT.gDirectory.mkdir('Signal') + ROOT.gDirectory.mkdir('Background') + + __hist_out.cd() + __hist_out.cd('Signal') + + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + 
ROOT.gDirectory.cd('Signal/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + + __hist_out.cd() + __hist_out.cd('Background') + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + + __hist_out.cd() + ROOT.gDirectory.cd('Background/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Background/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.Close() + + + def apply_predictions(self): + + self.__bst_train["xgb_preds"] = self.y_pred_train + self.__bst_train['issignal'] = self.y_train + + + self.__bst_test["xgb_preds"] = self.y_pred_test + self.__bst_test['issignal'] = self.y_test + + return self.__bst_train, self.__bst_test + + + def get_threshold(self): + + self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(self.y_train, + self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], + self.output_path) + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + __hist_out.cd() + + fpr = roc_curve_data['fpr_train'] + tpr = roc_curve_data['tpr_train'] + + + fpr1 = roc_curve_data['fpr_test'] + tpr1 = roc_curve_data['tpr_test'] + + + fpr_d_tr = array('d', fpr.tolist()) + tpr_d_tr = array('d', tpr.tolist()) + + fpr_d_ts = array('d', fpr1.tolist()) + tpr_d_ts = array('d', tpr1.tolist()) + + train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) + test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) + + train_roc.SetLineColor(ROOT.kRed + 2) + test_roc.SetLineColor(ROOT.kBlue + 2) + + + train_roc.SetLineWidth(3) + test_roc.SetLineWidth(3) + + + train_roc.SetLineStyle(9) + test_roc.SetLineStyle(9) + + + train_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic test") + + train_roc.GetXaxis().SetTitle('FPR'); + train_roc.GetYaxis().SetTitle('TPR'); + + test_roc.GetXaxis().SetTitle('FPR'); + 
test_roc.GetYaxis().SetTitle('TPR'); + + train_roc.Write("Train_roc") + test_roc.Write("Test_roc") + + __hist_out.Close() + + + return self.__train_best_thr, self.__test_best_thr + + + def apply_threshold(self): + cut_train = self.__train_best_thr + self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) + + cut_test = self.__test_best_thr + self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) + + return self.__train_pred, self.__test_pred + + + def get_result(self, x_train, x_test): + + train_with_preds = x_train.copy() + train_with_preds['xgb_preds1'] = self.__train_pred.values + + + test_with_preds = x_test.copy() + test_with_preds['xgb_preds1'] = self.__test_pred.values + + return train_with_preds, test_with_preds + + + def features_importance(self, bst): + # this one needs to be tested + ax = xgb.plot_importance(bst) + plt.rcParams['figure.figsize'] = [6, 3] + ax.figure.tight_layout() + ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") + + + def CM_plot_train_test(self): + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true neagtives is C11. 
+ + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance + + Parameters + ---------- + test_best: numpy.float32 + best threshold + + x_train: dataframe + we want to get confusion matrix on training datasets + """ + #lets take the best threshold and look at the confusion matrix + + cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) + np.set_printoptions(precision=2) + fig_train, axs_train = plt.subplots(figsize=(10, 8)) + axs_train.yaxis.set_label_coords(-0.04,.5) + axs_train.xaxis.set_label_coords(0.5,-.005) + plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], + title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') + + + cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) + np.set_printoptions(precision=2) + fig_test, axs_test = plt.subplots(figsize=(10, 8)) + axs_test.yaxis.set_label_coords(-0.04,.5) + axs_test.xaxis.set_label_coords(0.5,-.005) + plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], + title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) + plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') + + + + def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): + fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) + + + if sign ==0: + s_label = 'Background' + m = 5 + + if sign==1: + s_label = 'Signal' + m = 1 + + axs[0].set_aspect(aspect = 'auto') + axs[1].set_aspect(aspect = 'auto') + axs[2].set_aspect(aspect = 'auto') + + rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) + diff = df_orig.shape[0] - df_cut.shape[0] + 
axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) + axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) + axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) + + + + counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) + axs[0].set_xlabel('rapidity', fontsize=15) + axs[0].set_ylabel('pT, GeV', fontsize=15) + + + mpl.pyplot.colorbar(im0, ax = axs[0]) + + + + axs[0].xaxis.set_major_locator(MultipleLocator(1)) + axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[0].xaxis.set_tick_params(which='both', width=2) + + + fig.tight_layout() + + + counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) + axs[1].set_xlabel('rapidity', fontsize=15) + axs[1].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[1]) + + axs[1].xaxis.set_major_locator(MultipleLocator(1)) + axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[1].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + + counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[2].set_title(s_label + ' difference ', fontsize = 16) + axs[2].set_xlabel('rapidity', fontsize=15) + axs[2].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[2]) + + + axs[2].xaxis.set_major_locator(MultipleLocator(1)) + axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[2].xaxis.set_tick_params(which='both', width=2) + + 
fig.tight_layout() + + fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') + + + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') + + + rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) + pT_orig = array('d', df_orig['pT'].values.tolist()) + pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_orig)): + pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_before_cut.Draw('COLZ') + + + + + + rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) + pT_cut = array('d', df_cut['pT'].values.tolist()) + + pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_cut)): + pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_cut.Draw('COLZ') + + + + + + rapidity_diff = array('d', difference['rapidity'].values.tolist()) + pT_diff = array('d', difference['pT'].values.tolist()) + + pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_diff)): + pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_diff.GetXaxis().SetTitle('rapidity'); + pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); + pT_rap_diff.Draw('COLZ') + + + 
pT_rap_before_cut.Write() + pT_rap_cut.Write() + pT_rap_diff.Write() + + __hist_out.Close() + + + + def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + ---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + + + for feature in dfs_orig.columns: + fig, ax = plt.subplots(3, figsize=(20, 10)) + + + fontP = FontProperties() + fontP.set_size('xx-large') + + ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + + + '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + + '\nquality cuts ', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[0].xaxis.set_tick_params(labelsize=15) + ax[0].yaxis.set_tick_params(labelsize=15) + + ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) + ax[0].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[0].set_yscale('log') + + fig.tight_layout() + + + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + + '\nquality cuts + ML cut', + title_fontsize=15, 
fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[1].xaxis.set_tick_params(labelsize=15) + ax[1].yaxis.set_tick_params(labelsize=15) + + ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) + ax[1].set_xlabel(feature, fontsize = 25) + + if feature!='mass': + ax[1].set_yscale('log') + + fig.tight_layout() + + + + + ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[2].xaxis.set_tick_params(labelsize=15) + ax[2].yaxis.set_tick_params(labelsize=15) + + ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) + ax[2].set_xlabel(feature, fontsize = 25) + + if feature!=mass_var: + ax[2].set_yscale('log') + + fig.tight_layout() + + plt.savefig(pdf_key,format='pdf') + + dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) + dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) + + + dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) + dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) + + + dfs_diff_feat = array('d', difference_s[feature].values.tolist()) + + + dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, + min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) + + for i in range(len(dfs_orig_feat)): + dfs_orig_root.Fill(dfs_orig_feat[i]) + + dfs_orig_root.GetXaxis().SetTitle(feature); + + dfs_orig_root.Draw() + + dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, + min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) + + for i in 
range(len(dfb_orig_feat)): + dfb_orig_root.Fill(dfb_orig_feat[i]) + + dfb_orig_root.GetXaxis().SetTitle(feature); + dfb_orig_root.Draw() + + + dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, + min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) + + for i in range(len(dfs_cut_feat)): + dfs_cut_root.Fill(dfs_cut_feat[i]) + + dfs_cut_root.GetXaxis().SetTitle(feature); + dfs_cut_root.Draw() + + + dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, + min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) + + for i in range(len(dfb_cut_feat)): + dfb_cut_root.Fill(dfb_cut_feat[i]) + + dfb_cut_root.GetXaxis().SetTitle(feature); + dfb_cut_root.Draw() + + + dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, + min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) + + for i in range(len(dfs_diff_feat)): + dfs_diff_root.Fill(dfs_diff_feat[i]) + + dfs_diff_root.GetXaxis().SetTitle(feature); + dfs_diff_root.Draw() + + + __hist_out.cd() + + __hist_out.cd('Signal'+'/'+sample+'/'+'hists') + dfs_orig_root.Write() + dfs_cut_root.Write() + dfs_diff_root.Write() + + __hist_out.cd() + + __hist_out.cd('Background'+'/'+sample+'/'+'hists') + + dfb_orig_root.Write() + dfb_cut_root.Write() + + __hist_out.Close() + pdf_key.close() diff --git a/cand_class/config_reader.py b/cand_class/config_reader.py new file mode 100644 index 0000000..a98e7df --- /dev/null +++ b/cand_class/config_reader.py @@ -0,0 +1,53 @@ +from hipe4ml.tree_handler import TreeHandler +import tomli +import sys + + +def convertDF(input_file, mass_var): + """ + Opens input file in toml format, retrives signal, background and deploy data + like TreeHandler objects + + Parameters + ------------------------------------------------ + df: str + input toml file + """ + with open(str(input_file), encoding="utf-8") as inp_file: + 
inp_info = tomli.load(inp_file) + + signal = TreeHandler(inp_info["signal"]["path"], inp_info["signal"]["tree"]) + background = TreeHandler(inp_info["background"]["path"], inp_info["background"]["tree"]) + + + + + + bgr_left_edge = inp_info["peak_range"]["bgr_left_edge"] + bgr_right_edge = inp_info["peak_range"]["bgr_right_edge"] + + peak_left_edge = inp_info["peak_range"]["sgn_left_edge"] + peak_right_edge = inp_info["peak_range"]["sgn_right_edge"] + + + selection = str(bgr_left_edge)+'< '+mass_var+' <'+str(peak_left_edge)+' or '+str(peak_right_edge)+\ + '< '+mass_var+' <'+str(bgr_right_edge) + + signalH = signal.get_subset(size = inp_info["number_of_events"]["number_of_signal_events"]) + bkgH = background.get_subset(selection, size=inp_info["number_of_events"]["number_of_background_events"]) + + return signalH, bkgH + + +def read_log_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = tomli.load(inp_file) + + return inp_dict['log_scale']['variables'], inp_dict['log_scale']['variables'] + + + +def read_train_vars(inp_file): + with open(str(inp_file), encoding="utf-8") as inp_file: + inp_dict = tomli.load(inp_file) + return inp_dict['train_vars'] diff --git a/cand_class/helper.py b/cand_class/helper.py new file mode 100644 index 0000000..182c8b6 --- /dev/null +++ b/cand_class/helper.py @@ -0,0 +1,221 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from config_reader import * + +import xgboost as xgb +from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score +from numpy import sqrt, log, argmax +import itertools +import treelite + + + +def transform_df_to_log(df, vars, inp_file): + """ + Transforms DataFrame to DataFrame with features in log scale + Parameters + ------------------------------------------------ + df: pandas.DataFrame + input DataFrame + vars: list of str + vars to be transformed + inp_file: + config TOML file with list of features that should and shouldn't be + 
transformed to log scale + """ + df_new = pd.DataFrame() + + non_log_x, log_x = read_log_vars(inp_file) + + + + for var in vars: + if var in log_x: + df_new['log('+ var+')'] = np.log(df[var]) + if var in non_log_x: + df_new[var] = df[var] + return df_new + + +def xgb_matr(x_train, y_train, x_test, y_test, cuts): + """ + To make machine learning algorithms more efficient on unseen data we divide + our data into two sets. One set is for training the algorithm and the other + is for testing the algorithm. If we don't do this then the algorithm can + overfit and we will not capture the general trends in the data. + + Parameters + ---------- + df_scaled: dataframe + dataframe with mixed signal and background + + """ + + dtrain = xgb.DMatrix(x_train[cuts], label = y_train) + dtest=xgb.DMatrix(x_test[cuts], label = y_test) + + + return dtrain, dtest + + +def AMS(y_true, y_predict, y_true1, y_predict1, output_path): + roc_auc=roc_auc_score(y_true, y_predict) + fpr, tpr, thresholds = roc_curve(y_true, y_predict,drop_intermediate=False ,pos_label=1) + + for i in range(len(thresholds)): + if thresholds[i] > 1: + thresholds[i]-=1 + + S0 = sqrt(2 * ((tpr + fpr) * log((1 + tpr/fpr)) - tpr)) + S0 = S0[~np.isnan(S0)] + S0 = S0[~np.isinf(S0)] + xi = argmax(S0) + S0_best_threshold = (thresholds[xi]) + + roc_auc1=roc_auc_score(y_true1, y_predict1) + fpr1, tpr1, thresholds1 = roc_curve(y_true1, y_predict1,drop_intermediate=False ,pos_label=1) + + for i in range(len(thresholds1)): + if thresholds1[i] > 1: + thresholds1[i]-=1 + + S01 = sqrt(2 * ((tpr1 + fpr1) * log((1 + tpr1/fpr1)) - tpr1)) + S01 = S01[~np.isnan(S01)] + S01 = S01[~np.isinf(S01)] + xi1 = argmax(S01) + S0_best_threshold1 = (thresholds[xi1]) + + + fig, ax = plt.subplots(figsize=(12, 8), dpi = 100) + plt.plot(fpr, tpr, linewidth=3 ,linestyle=':',color='darkorange',label='ROC curve train (area = %0.6f)' % roc_auc) + plt.plot(fpr1, tpr1, color='green',label='ROC curve test (area = %0.6f)' % roc_auc1) + plt.plot([0, 1], [0, 
1], color='navy', linestyle='--', label='Random guess') + #plt.scatter(fpr[xi], tpr[xi], marker='o', color='black', label= 'Best Threshold train set = '+"%.4f" % S0_best_threshold +'\n AMS = '+ "%.2f" % S0[xi]) + plt.scatter(fpr1[xi1], tpr1[xi1], marker='o', s=80, color='blue', label= 'Best Threshold test set = '+"%.4f" % S0_best_threshold1 +'\n AMS = '+ "%.2f" % S01[xi1]) + plt.xlabel('False Positive Rate', fontsize = 18) + plt.ylabel('True Positive Rate', fontsize = 18) + plt.legend(loc="lower right", fontsize = 18) + plt.title('Receiver operating characteristic', fontsize = 18) + ax.tick_params(axis='both', which='major', labelsize=18) + plt.xlim([-0.01, 1.0]) + plt.ylim([0, 1.02]) + #axs.axis([-0.01, 1, 0.9, 1]) + fig.tight_layout() + fig.savefig(str(output_path)+'/hists.png') + plt.show() + + roc_curve_data = dict() + roc_curve_data["fpr_train"] = fpr + roc_curve_data["tpr_train"] = tpr + + roc_curve_data["fpr_test"] = fpr1 + roc_curve_data["tpr_test"] = tpr1 + + return S0_best_threshold, S0_best_threshold1, roc_curve_data + + +def plot_confusion_matrix(cm, classes, + normalize=False, + title='Confusion matrix', + cmap=plt.cm.Blues): + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + + print(cm) + + plt.imshow(cm, interpolation='nearest', cmap=cmap) + plt.title(title) + plt.colorbar() + tick_marks = np.arange(len(classes)) + plt.xticks(tick_marks, classes, rotation=45) + plt.yticks(tick_marks, classes) + + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. 
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): + plt.text(j, i, format(cm[i, j], fmt), + horizontalalignment="center", + color="white" if cm[i, j] > thresh else "black") + + plt.tight_layout() + plt.ylabel('True label',fontsize = 15) + plt.xlabel('Predicted label',fontsize = 15) + + +def preds_prob(df, preds, true, dataset, output_path): + if dataset =='train': + label1 = 'XGB Predictions on the training data set' + else: + label1 = 'XGB Predictions on the test data set' + fig, ax = plt.subplots(figsize=(12, 8)) + bins1=100 + plt.hist(df[preds], bins=bins1,facecolor='green',alpha = 0.3, label=label1) + TP = df[(df[true]==1)] + TN = df[(df[true]==0)] + #TP[preds].plot.hist(ax=ax, bins=bins1,facecolor='blue', histtype='stepfilled',alpha = 0.3, label='True Positives/signal in predictions') + hist, bins = np.histogram(TP[preds], bins=bins1) + err = np.sqrt(hist) + center = (bins[:-1] + bins[1:]) / 2 + + + hist1, bins1 = np.histogram(TN[preds], bins=bins1) + err1 = np.sqrt(hist1) + plt.errorbar(center, hist1, yerr=err1, fmt='o', + c='Red', label='Background in predictions') + + plt.errorbar(center, hist, yerr=err, fmt='o', + c='blue', label='Signal in predictions') + + ax.set_yscale('log') + plt.xlabel('Probability',fontsize=18) + plt.ylabel('Counts', fontsize=18) + plt.legend(fontsize=18) + ax.set_xticks(np.arange(0,1.1,0.1)) + ax.tick_params(axis='both', which='major', labelsize=18) + ax.tick_params(axis='both', which='minor', labelsize=16) + plt.show() + fig.tight_layout() + fig.savefig(str(output_path)+'/test_best_pred.png') + + +def diff_SB(df, signal_label): + dfs = df[df[signal_label]==1] + dfb = df[df[signal_label]==0] + + return dfs, dfb + + +def difference_df(df_orig, df_cut, cut): + return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False) + + +def diff_SB_cut(df, target_label): + dfs_cut = df[(df['xgb_preds1']==1) & (df[target_label]==1)] + dfb_cut = df[(df['xgb_preds1']==1) & (df[target_label]==0)] + + return 
dfs_cut, dfb_cut + + + +def save_model_lib(bst_model, output_path): + bst = bst_model.get_booster() + + #create an object out of your model, bst in our case + model = treelite.Model.from_xgboost(bst) + #use GCC compiler + toolchain = 'gcc' + #parallel_comp can be changed upto as many processors as one have + model.export_lib(toolchain=toolchain, libpath=output_path+'/xgb_model.so', + params={'parallel_comp': 4}, verbose=True) + + + # Operating system of the target machine + platform = 'unix' + model.export_srcpkg(platform=platform, toolchain=toolchain, + pkgpath=output_path+'/XGBmodel.zip', libname='xgb_model.so', + verbose=True) diff --git a/cand_class/hipe_conf_params.py b/cand_class/hipe_conf_params.py new file mode 100644 index 0000000..3884f03 --- /dev/null +++ b/cand_class/hipe_conf_params.py @@ -0,0 +1,64 @@ +from hipe4ml.model_handler import ModelHandler +from dataclasses import dataclass +import xgboost as xgb + +import matplotlib.pyplot as plt +from hipe4ml import plot_utils + +import xgboost as xgb + + +@dataclass +class XGBmodel(): + features_for_train: list + hyper_pars_ranges: dict + train_test_data: list + output_path : str + __model_hdl: ModelHandler = (None, None, None) + metrics: str = 'roc_auc' + nfold: int = 3 + init_points: int = 1 + n_iter: int = 2 + n_jobs: int = -1 + + + + + def modelBO(self): + model_clf = xgb.XGBClassifier() + self.__model_hdl = ModelHandler(model_clf, self.features_for_train) + self.__model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, + self.metrics, self.nfold, self.init_points, self.n_iter, self.n_jobs) + + + + def train_test_pred(self): + self.__model_hdl.train_test_model(self.train_test_data) + + y_pred_train = self.__model_hdl.predict(self.train_test_data[0], False) + y_pred_test = self.__model_hdl.predict(self.train_test_data[2], False) + + + return y_pred_train, y_pred_test + + + def save_predictions(self, filename): + print(self.__model_hdl.get_original_model()) + 
self.__model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) + + + def load_model(self, filename): + self.__model_hdl.load_model_handler(filename) + + + def get_mode_booster(self): + return self.__model_hdl.model + + + def plot_dists(self): + + leg_labels = ['background', 'signal'] + ml_out_fig = plot_utils.plot_output_train_test(self.__model_hdl, self.train_test_data, 100, + False, leg_labels, True, density=True) + + plt.savefig(str(self.output_path)+'/thresholds.png') From 6e74feb71d59aeceb79320f82cc8eb7fb3861778 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:36:11 +0300 Subject: [PATCH 26/46] Deleted old folder --- CandidatesClassifier0/MLconfig_variables.py | 251 --------- CandidatesClassifier0/apply_model.py | 560 -------------------- CandidatesClassifier0/config_reader.py | 53 -- CandidatesClassifier0/helper.py | 221 -------- CandidatesClassifier0/hipe_conf_params.py | 64 --- 5 files changed, 1149 deletions(-) delete mode 100644 CandidatesClassifier0/MLconfig_variables.py delete mode 100644 CandidatesClassifier0/apply_model.py delete mode 100644 CandidatesClassifier0/config_reader.py delete mode 100644 CandidatesClassifier0/helper.py delete mode 100644 CandidatesClassifier0/hipe_conf_params.py diff --git a/CandidatesClassifier0/MLconfig_variables.py b/CandidatesClassifier0/MLconfig_variables.py deleted file mode 100644 index 9221722..0000000 --- a/CandidatesClassifier0/MLconfig_variables.py +++ /dev/null @@ -1,251 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from scipy.stats import sem -from scipy.stats import binned_statistic as b_s -import matplotlib as mpl - -from hipe4ml import plot_utils - -def correlation_matrix(bgr, sign, vars_to_draw, leg_labels, output_path): - res_s_b = plot_utils.plot_corr([bgr, sign], vars_to_draw, leg_labels) - res_s_b[0].savefig(output_path+'/'+'corr_matrix_bgr.png') - res_s_b[1].savefig(output_path+'/'+'corr_matrix_sign.png') - - - -def 
calculate_correlation(df, vars_to_corr, target_var) : - """ - Calculates correlations with target variable variable and standart errors - Parameters - ------------------------------------------------ - df: pandas.DataFrame - imput data - vars_to_corr: list of str - variables that correlate with target value - target_var: str - variable that correlates with another variables mentioned in list - """ - - - mean = df[target_var].mean() - sigma = df[target_var].std() - - correlation = [] - error = [] - - for j in vars_to_corr : - mean_j = df[j].mean() - sigma_j = df[j].std() - - cov = (df[j] - mean_j) * (df[target_var] - mean) / (sigma*sigma_j) - correlation.append(cov.mean()) - error.append(sem(cov)) - - return correlation, error - - -def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path): - """ - Plots correlations - Parameters - ------------------------------------------------ - vars_to_draw: list of str - variables that correlate with target value - var_to_corr: str - variables that correlate with target value - corr_signal: list - signal covariance coefficient between variable and target variable - corr_signal_errors: - signal covariance standart error of the mean - corr_bg: list - background covariance coefficient between variable and target variable - corr_bg_errors: - background covariance standart error of the mean - output_path: - path that contains output plot - """ - - fig, ax = plt.subplots(figsize=(20,10)) - plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') - plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') - ax.grid(zorder=0) - ax.set_xticklabels(vars_to_draw, fontsize=30, rotation =70) - # ax.set_yticklabels([-0.5,-0.4, -0.2,0, -0.2, 0.4], fontsize=25) - ax.yaxis.set_tick_params(labelsize=30) - plt.legend(('signal','background'), fontsize = 25) - plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) - 
plt.ylabel('Correlation coefficient', fontsize = 30) - fig.tight_layout() - fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') - - - -def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): - """ - This function takes the entries of the variables and distributes them in 25 bins. - The function then plots the bin centers of the first variable on the x-axis and - the mean values of the bins of the second variable on the y-axis, along with its bin stds. - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input DataFrame - variable_xaxis: str - variable to be plotted on x axis (invariant mass) - x_unit: str - x axis variable units - variable_yaxis: str - variable to be plotted on y axis - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - peak: int - invariant mass peak position - edge_left: int - left edge of x axis variable - edge_right: int - left edge of y axis variable - """ - - if sign == 1: - keyword = 'signal' - if sign == 0: - keyword = 'background' - - df = df[(df[variable_xaxis] < edge_right) & (df[variable_xaxis] > edge_left)] - - for var in df.columns: - if var != variable_xaxis: - - fig, axs = plt.subplots(figsize=(20, 15)) - - bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) - bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) - bin_count, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='count',bins= 25) - bin_width = (bin_edges[1] - bin_edges[0]) - bin_centers = bin_edges[1:] - bin_width/2 - - nan_ind = np.where(np.isnan(bin_means)) - bin_centers = np.delete(bin_centers, nan_ind) - bin_means = np.delete(bin_means, nan_ind) - bin_count = np.delete(bin_count, nan_ind) - bin_std = np.delete(bin_std , nan_ind) - - - plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), 
linestyle='none', marker='.',mfc='red', ms=10) - - - - plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ - '('+keyword+')', fontsize=25) - plt.xlabel('Mass', fontsize=25) - plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) - - - plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') - - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) - fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - - pdf_key.close() - - -def plot2D_all(df, sample, sgn, pdf_key): - """ - Plots 2D distribution between all the variables - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input dataframe - sample: str - title of the sample - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - """ - - for xvar in df.columns: - for yvar in df.columns: - if xvar!=yvar: - fig, axs = plt.subplots(figsize=(15, 10)) - cax = plt.hist2d(df[xvar],df[yvar],range=[[df[xvar].min(), df[xvar].max()], [df[yvar].min(), df[yvar].max()]], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) - - - if sgn==1: - plt.title('Signal candidates ' + sample, fontsize = 25) - - if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) - - - plt.xlabel(xvar, fontsize=25) - plt.ylabel(yvar, fontsize=25) - - - mpl.pyplot.colorbar() - - plt.legend(shadow=True,title =str(len(df))+ " samples") - - fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - pdf_key.close() - - -def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): - """ - Plots 2D distribution between variable and invariant mass - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input dataframe - sample: str - title of the sample - mass_var: str - name of the invariant mass variable - mass_range: list - mass range to be 
plotted - sgn: int(0 or 1) - signal definition(0 background, 1 signal) - peak: int - invariant mass value - pdf_key: matplotlib.backends.backend_pdf.PdfPages - output pdf file - """ - - for var in df.columns: - if var != mass_var: - fig, axs = plt.subplots(figsize=(15, 10)) - cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) - - - if sgn==1: - plt.title('Signal candidates ' + sample, fontsize = 25) - - if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) - - - plt.xlabel(mass_var, fontsize=25) - plt.ylabel(var, fontsize=25) - - plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') - - mpl.pyplot.colorbar() - - - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) - - - plt.legend(shadow=True,title =str(len(df))+ " samples") - - fig.tight_layout() - plt.savefig(pdf_key,format='pdf') - pdf_key.close() diff --git a/CandidatesClassifier0/apply_model.py b/CandidatesClassifier0/apply_model.py deleted file mode 100644 index 3984c20..0000000 --- a/CandidatesClassifier0/apply_model.py +++ /dev/null @@ -1,560 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import xgboost as xgb -from hipe4ml.plot_utils import plot_roc_train_test -from helper import * -from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score - -from dataclasses import dataclass - -from matplotlib.font_manager import FontProperties - -from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, - AutoMinorLocator) - -import gc -import matplotlib as mpl -import ROOT - -from array import array - - -mpl.rc('figure', max_open_warning = 0) - - -@dataclass -class ApplyXGB: - - y_pred_train : np.ndarray - y_pred_test : np.ndarray - - y_train : np.ndarray - y_test : np.ndarray - - output_path : str - - root_output_name = 'hists.root' - - __bst_train : pd = pd.DataFrame() - __bst_test : 
pd = pd.DataFrame() - - __train_best_thr : int = 0 - __test_best_thr : int = 0 - - - def __post_init__(self): - - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") - __hist_out.cd() - - ROOT.gDirectory.mkdir('Signal') - ROOT.gDirectory.mkdir('Background') - - __hist_out.cd() - __hist_out.cd('Signal') - - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - __hist_out.cd() - ROOT.gDirectory.cd('Signal/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.cd() - ROOT.gDirectory.cd('Signal/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - - __hist_out.cd() - __hist_out.cd('Background') - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - - __hist_out.cd() - ROOT.gDirectory.cd('Background/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.cd() - ROOT.gDirectory.cd('Background/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.Close() - - - def apply_predictions(self): - - self.__bst_train["xgb_preds"] = self.y_pred_train - self.__bst_train['issignal'] = self.y_train - - - self.__bst_test["xgb_preds"] = self.y_pred_test - self.__bst_test['issignal'] = self.y_test - - return self.__bst_train, self.__bst_test - - - def get_threshold(self): - - self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(self.y_train, - self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], - self.output_path) - - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - - __hist_out.cd() - - fpr = roc_curve_data['fpr_train'] - tpr = roc_curve_data['tpr_train'] - - - fpr1 = roc_curve_data['fpr_test'] - tpr1 = roc_curve_data['tpr_test'] - - - fpr_d_tr = array('d', fpr.tolist()) - tpr_d_tr = array('d', tpr.tolist()) - - fpr_d_ts = array('d', fpr1.tolist()) - tpr_d_ts = array('d', tpr1.tolist()) - - train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, 
tpr_d_tr) - test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) - - train_roc.SetLineColor(ROOT.kRed + 2) - test_roc.SetLineColor(ROOT.kBlue + 2) - - - train_roc.SetLineWidth(3) - test_roc.SetLineWidth(3) - - - train_roc.SetLineStyle(9) - test_roc.SetLineStyle(9) - - - train_roc.SetTitle("Receiver operating characteristic train") - test_roc.SetTitle("Receiver operating characteristic test") - - train_roc.GetXaxis().SetTitle('FPR'); - train_roc.GetYaxis().SetTitle('TPR'); - - test_roc.GetXaxis().SetTitle('FPR'); - test_roc.GetYaxis().SetTitle('TPR'); - - train_roc.Write("Train_roc") - test_roc.Write("Test_roc") - - __hist_out.Close() - - - return self.__train_best_thr, self.__test_best_thr - - - def apply_threshold(self): - cut_train = self.__train_best_thr - self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) - - cut_test = self.__test_best_thr - self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) - - return self.__train_pred, self.__test_pred - - - def get_result(self, x_train, x_test): - - train_with_preds = x_train.copy() - train_with_preds['xgb_preds1'] = self.__train_pred.values - - - test_with_preds = x_test.copy() - test_with_preds['xgb_preds1'] = self.__test_pred.values - - return train_with_preds, test_with_preds - - - def features_importance(self, bst): - # this one needs to be tested - ax = xgb.plot_importance(bst) - plt.rcParams['figure.figsize'] = [6, 3] - ax.figure.tight_layout() - ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") - - - def CM_plot_train_test(self): - """ - Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to - the number of observations known to be in group i and predicted to be in - group j. Thus in binary classification, the count of true positives is C00, - false negatives C01,false positives is C10, and true neagtives is C11. 
- - Confusion matrix is applied to previously unseen by model data, so we can - estimate model's performance - - Parameters - ---------- - test_best: numpy.float32 - best threshold - - x_train: dataframe - we want to get confusion matrix on training datasets - """ - #lets take the best threshold and look at the confusion matrix - - cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_train, axs_train = plt.subplots(figsize=(10, 8)) - axs_train.yaxis.set_label_coords(-0.04,.5) - axs_train.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], - title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') - - - cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) - np.set_printoptions(precision=2) - fig_test, axs_test = plt.subplots(figsize=(10, 8)) - axs_test.yaxis.set_label_coords(-0.04,.5) - axs_test.xaxis.set_label_coords(0.5,-.005) - plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], - title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) - plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') - - - - def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): - fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) - - - if sign ==0: - s_label = 'Background' - m = 5 - - if sign==1: - s_label = 'Signal' - m = 1 - - axs[0].set_aspect(aspect = 'auto') - axs[1].set_aspect(aspect = 'auto') - axs[2].set_aspect(aspect = 'auto') - - rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) - saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) - diff = df_orig.shape[0] - df_cut.shape[0] - 
axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14) - axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14) - axs[2].legend(shadow=True, title ='ML cut saves \n'+ str(saved) +'% of '+ s_label, fontsize =14) - - - - counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) - axs[0].set_xlabel('rapidity', fontsize=15) - axs[0].set_ylabel('pT, GeV', fontsize=15) - - - mpl.pyplot.colorbar(im0, ax = axs[0]) - - - - axs[0].xaxis.set_major_locator(MultipleLocator(1)) - axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[0].xaxis.set_tick_params(which='both', width=2) - - - fig.tight_layout() - - - counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) - axs[1].set_xlabel('rapidity', fontsize=15) - axs[1].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[1]) - - axs[1].xaxis.set_major_locator(MultipleLocator(1)) - axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[1].xaxis.set_tick_params(which='both', width=2) - - fig.tight_layout() - - - counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, - norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) - - axs[2].set_title(s_label + ' difference ', fontsize = 16) - axs[2].set_xlabel('rapidity', fontsize=15) - axs[2].set_ylabel('pT, GeV', fontsize=15) - - mpl.pyplot.colorbar(im1, ax = axs[2]) - - - axs[2].xaxis.set_major_locator(MultipleLocator(1)) - axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) - - axs[2].xaxis.set_tick_params(which='both', width=2) - - 
fig.tight_layout() - - fig.savefig(self.output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') - - - - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - __hist_out.cd() - ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') - - - rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) - pT_orig = array('d', df_orig['pT'].values.tolist()) - pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_orig)): - pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); - pT_rap_before_cut.Draw('COLZ') - - - - - - rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) - pT_cut = array('d', df_cut['pT'].values.tolist()) - - pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_cut)): - pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); - pT_rap_cut.Draw('COLZ') - - - - - - rapidity_diff = array('d', difference['rapidity'].values.tolist()) - pT_diff = array('d', difference['pT'].values.tolist()) - - pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_diff)): - pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_diff.GetXaxis().SetTitle('rapidity'); - pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); - pT_rap_diff.Draw('COLZ') - - - 
pT_rap_before_cut.Write() - pT_rap_cut.Write() - pT_rap_diff.Write() - - __hist_out.Close() - - - - def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): - """ - Applied quality cuts and created distributions for all the features in pdf - file - Parameters - ---------- - df_s: dataframe - signal - df_b: dataframe - background - feature: str - name of the feature to be plotted - pdf_key: PdfPages object - name of pdf document with distributions - """ - - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - __hist_out.cd() - - - for feature in dfs_orig.columns: - fig, ax = plt.subplots(3, figsize=(20, 10)) - - - fontP = FontProperties() - fontP.set_size('xx-large') - - ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + - - '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + - '\nquality cuts ', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[0].xaxis.set_tick_params(labelsize=15) - ax[0].yaxis.set_tick_params(labelsize=15) - - ax[0].set_title(str(feature) + ' MC '+ sample + ' before ML cut', fontsize = 25) - ax[0].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[0].set_yscale('log') - - fig.tight_layout() - - - ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + - '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + - '\nquality cuts + ML cut', - title_fontsize=15, 
fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[1].xaxis.set_tick_params(labelsize=15) - ax[1].yaxis.set_tick_params(labelsize=15) - - ax[1].set_title(feature + ' MC '+ sample+ ' after ML cut', fontsize = 25) - ax[1].set_xlabel(feature, fontsize = 25) - - if feature!='mass': - ax[1].set_yscale('log') - - fig.tight_layout() - - - - - ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', - title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), - loc='upper left', prop=fontP,) - - - ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - - ax[2].xaxis.set_tick_params(labelsize=15) - ax[2].yaxis.set_tick_params(labelsize=15) - - ax[2].set_title(feature + ' MC '+ sample +' signal difference', fontsize = 25) - ax[2].set_xlabel(feature, fontsize = 25) - - if feature!=mass_var: - ax[2].set_yscale('log') - - fig.tight_layout() - - plt.savefig(pdf_key,format='pdf') - - dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) - dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) - - - dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) - dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) - - - dfs_diff_feat = array('d', difference_s[feature].values.tolist()) - - - dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, - min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) - - for i in range(len(dfs_orig_feat)): - dfs_orig_root.Fill(dfs_orig_feat[i]) - - dfs_orig_root.GetXaxis().SetTitle(feature); - - dfs_orig_root.Draw() - - dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, - min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) - - for i in 
range(len(dfb_orig_feat)): - dfb_orig_root.Fill(dfb_orig_feat[i]) - - dfb_orig_root.GetXaxis().SetTitle(feature); - dfb_orig_root.Draw() - - - dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, - min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) - - for i in range(len(dfs_cut_feat)): - dfs_cut_root.Fill(dfs_cut_feat[i]) - - dfs_cut_root.GetXaxis().SetTitle(feature); - dfs_cut_root.Draw() - - - dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, - min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) - - for i in range(len(dfb_cut_feat)): - dfb_cut_root.Fill(dfb_cut_feat[i]) - - dfb_cut_root.GetXaxis().SetTitle(feature); - dfb_cut_root.Draw() - - - dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, - min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) - - for i in range(len(dfs_diff_feat)): - dfs_diff_root.Fill(dfs_diff_feat[i]) - - dfs_diff_root.GetXaxis().SetTitle(feature); - dfs_diff_root.Draw() - - - __hist_out.cd() - - __hist_out.cd('Signal'+'/'+sample+'/'+'hists') - dfs_orig_root.Write() - dfs_cut_root.Write() - dfs_diff_root.Write() - - __hist_out.cd() - - __hist_out.cd('Background'+'/'+sample+'/'+'hists') - - dfb_orig_root.Write() - dfb_cut_root.Write() - - __hist_out.Close() - pdf_key.close() diff --git a/CandidatesClassifier0/config_reader.py b/CandidatesClassifier0/config_reader.py deleted file mode 100644 index a98e7df..0000000 --- a/CandidatesClassifier0/config_reader.py +++ /dev/null @@ -1,53 +0,0 @@ -from hipe4ml.tree_handler import TreeHandler -import tomli -import sys - - -def convertDF(input_file, mass_var): - """ - Opens input file in toml format, retrives signal, background and deploy data - like TreeHandler objects - - Parameters - ------------------------------------------------ - df: str - input toml file - """ - with open(str(input_file), 
encoding="utf-8") as inp_file: - inp_info = tomli.load(inp_file) - - signal = TreeHandler(inp_info["signal"]["path"], inp_info["signal"]["tree"]) - background = TreeHandler(inp_info["background"]["path"], inp_info["background"]["tree"]) - - - - - - bgr_left_edge = inp_info["peak_range"]["bgr_left_edge"] - bgr_right_edge = inp_info["peak_range"]["bgr_right_edge"] - - peak_left_edge = inp_info["peak_range"]["sgn_left_edge"] - peak_right_edge = inp_info["peak_range"]["sgn_right_edge"] - - - selection = str(bgr_left_edge)+'< '+mass_var+' <'+str(peak_left_edge)+' or '+str(peak_right_edge)+\ - '< '+mass_var+' <'+str(bgr_right_edge) - - signalH = signal.get_subset(size = inp_info["number_of_events"]["number_of_signal_events"]) - bkgH = background.get_subset(selection, size=inp_info["number_of_events"]["number_of_background_events"]) - - return signalH, bkgH - - -def read_log_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - - return inp_dict['log_scale']['variables'], inp_dict['log_scale']['variables'] - - - -def read_train_vars(inp_file): - with open(str(inp_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - return inp_dict['train_vars'] diff --git a/CandidatesClassifier0/helper.py b/CandidatesClassifier0/helper.py deleted file mode 100644 index 182c8b6..0000000 --- a/CandidatesClassifier0/helper.py +++ /dev/null @@ -1,221 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from config_reader import * - -import xgboost as xgb -from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score -from numpy import sqrt, log, argmax -import itertools -import treelite - - - -def transform_df_to_log(df, vars, inp_file): - """ - Transforms DataFrame to DataFrame with features in log scale - Parameters - ------------------------------------------------ - df: pandas.DataFrame - input DataFrame - vars: list of str - vars to be transformed - inp_file: - config 
TOML file with list of features that should and shouldn't be - transformed to log scale - """ - df_new = pd.DataFrame() - - non_log_x, log_x = read_log_vars(inp_file) - - - - for var in vars: - if var in log_x: - df_new['log('+ var+')'] = np.log(df[var]) - if var in non_log_x: - df_new[var] = df[var] - return df_new - - -def xgb_matr(x_train, y_train, x_test, y_test, cuts): - """ - To make machine learning algorithms more efficient on unseen data we divide - our data into two sets. One set is for training the algorithm and the other - is for testing the algorithm. If we don't do this then the algorithm can - overfit and we will not capture the general trends in the data. - - Parameters - ---------- - df_scaled: dataframe - dataframe with mixed signal and background - - """ - - dtrain = xgb.DMatrix(x_train[cuts], label = y_train) - dtest=xgb.DMatrix(x_test[cuts], label = y_test) - - - return dtrain, dtest - - -def AMS(y_true, y_predict, y_true1, y_predict1, output_path): - roc_auc=roc_auc_score(y_true, y_predict) - fpr, tpr, thresholds = roc_curve(y_true, y_predict,drop_intermediate=False ,pos_label=1) - - for i in range(len(thresholds)): - if thresholds[i] > 1: - thresholds[i]-=1 - - S0 = sqrt(2 * ((tpr + fpr) * log((1 + tpr/fpr)) - tpr)) - S0 = S0[~np.isnan(S0)] - S0 = S0[~np.isinf(S0)] - xi = argmax(S0) - S0_best_threshold = (thresholds[xi]) - - roc_auc1=roc_auc_score(y_true1, y_predict1) - fpr1, tpr1, thresholds1 = roc_curve(y_true1, y_predict1,drop_intermediate=False ,pos_label=1) - - for i in range(len(thresholds1)): - if thresholds1[i] > 1: - thresholds1[i]-=1 - - S01 = sqrt(2 * ((tpr1 + fpr1) * log((1 + tpr1/fpr1)) - tpr1)) - S01 = S01[~np.isnan(S01)] - S01 = S01[~np.isinf(S01)] - xi1 = argmax(S01) - S0_best_threshold1 = (thresholds[xi1]) - - - fig, ax = plt.subplots(figsize=(12, 8), dpi = 100) - plt.plot(fpr, tpr, linewidth=3 ,linestyle=':',color='darkorange',label='ROC curve train (area = %0.6f)' % roc_auc) - plt.plot(fpr1, tpr1, color='green',label='ROC 
curve test (area = %0.6f)' % roc_auc1) - plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Random guess') - #plt.scatter(fpr[xi], tpr[xi], marker='o', color='black', label= 'Best Threshold train set = '+"%.4f" % S0_best_threshold +'\n AMS = '+ "%.2f" % S0[xi]) - plt.scatter(fpr1[xi1], tpr1[xi1], marker='o', s=80, color='blue', label= 'Best Threshold test set = '+"%.4f" % S0_best_threshold1 +'\n AMS = '+ "%.2f" % S01[xi1]) - plt.xlabel('False Positive Rate', fontsize = 18) - plt.ylabel('True Positive Rate', fontsize = 18) - plt.legend(loc="lower right", fontsize = 18) - plt.title('Receiver operating characteristic', fontsize = 18) - ax.tick_params(axis='both', which='major', labelsize=18) - plt.xlim([-0.01, 1.0]) - plt.ylim([0, 1.02]) - #axs.axis([-0.01, 1, 0.9, 1]) - fig.tight_layout() - fig.savefig(str(output_path)+'/hists.png') - plt.show() - - roc_curve_data = dict() - roc_curve_data["fpr_train"] = fpr - roc_curve_data["tpr_train"] = tpr - - roc_curve_data["fpr_test"] = fpr1 - roc_curve_data["tpr_test"] = tpr1 - - return S0_best_threshold, S0_best_threshold1, roc_curve_data - - -def plot_confusion_matrix(cm, classes, - normalize=False, - title='Confusion matrix', - cmap=plt.cm.Blues): - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - print(cm) - - plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) - plt.colorbar() - tick_marks = np.arange(len(classes)) - plt.xticks(tick_marks, classes, rotation=45) - plt.yticks(tick_marks, classes) - - fmt = '.2f' if normalize else 'd' - thresh = cm.max() / 2. 
- for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): - plt.text(j, i, format(cm[i, j], fmt), - horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") - - plt.tight_layout() - plt.ylabel('True label',fontsize = 15) - plt.xlabel('Predicted label',fontsize = 15) - - -def preds_prob(df, preds, true, dataset, output_path): - if dataset =='train': - label1 = 'XGB Predictions on the training data set' - else: - label1 = 'XGB Predictions on the test data set' - fig, ax = plt.subplots(figsize=(12, 8)) - bins1=100 - plt.hist(df[preds], bins=bins1,facecolor='green',alpha = 0.3, label=label1) - TP = df[(df[true]==1)] - TN = df[(df[true]==0)] - #TP[preds].plot.hist(ax=ax, bins=bins1,facecolor='blue', histtype='stepfilled',alpha = 0.3, label='True Positives/signal in predictions') - hist, bins = np.histogram(TP[preds], bins=bins1) - err = np.sqrt(hist) - center = (bins[:-1] + bins[1:]) / 2 - - - hist1, bins1 = np.histogram(TN[preds], bins=bins1) - err1 = np.sqrt(hist1) - plt.errorbar(center, hist1, yerr=err1, fmt='o', - c='Red', label='Background in predictions') - - plt.errorbar(center, hist, yerr=err, fmt='o', - c='blue', label='Signal in predictions') - - ax.set_yscale('log') - plt.xlabel('Probability',fontsize=18) - plt.ylabel('Counts', fontsize=18) - plt.legend(fontsize=18) - ax.set_xticks(np.arange(0,1.1,0.1)) - ax.tick_params(axis='both', which='major', labelsize=18) - ax.tick_params(axis='both', which='minor', labelsize=16) - plt.show() - fig.tight_layout() - fig.savefig(str(output_path)+'/test_best_pred.png') - - -def diff_SB(df, signal_label): - dfs = df[df[signal_label]==1] - dfb = df[df[signal_label]==0] - - return dfs, dfb - - -def difference_df(df_orig, df_cut, cut): - return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False) - - -def diff_SB_cut(df, target_label): - dfs_cut = df[(df['xgb_preds1']==1) & (df[target_label]==1)] - dfb_cut = df[(df['xgb_preds1']==1) & (df[target_label]==0)] - - return 
dfs_cut, dfb_cut - - - -def save_model_lib(bst_model, output_path): - bst = bst_model.get_booster() - - #create an object out of your model, bst in our case - model = treelite.Model.from_xgboost(bst) - #use GCC compiler - toolchain = 'gcc' - #parallel_comp can be changed upto as many processors as one have - model.export_lib(toolchain=toolchain, libpath=output_path+'/xgb_model.so', - params={'parallel_comp': 4}, verbose=True) - - - # Operating system of the target machine - platform = 'unix' - model.export_srcpkg(platform=platform, toolchain=toolchain, - pkgpath=output_path+'/XGBmodel.zip', libname='xgb_model.so', - verbose=True) diff --git a/CandidatesClassifier0/hipe_conf_params.py b/CandidatesClassifier0/hipe_conf_params.py deleted file mode 100644 index 3884f03..0000000 --- a/CandidatesClassifier0/hipe_conf_params.py +++ /dev/null @@ -1,64 +0,0 @@ -from hipe4ml.model_handler import ModelHandler -from dataclasses import dataclass -import xgboost as xgb - -import matplotlib.pyplot as plt -from hipe4ml import plot_utils - -import xgboost as xgb - - -@dataclass -class XGBmodel(): - features_for_train: list - hyper_pars_ranges: dict - train_test_data: list - output_path : str - __model_hdl: ModelHandler = (None, None, None) - metrics: str = 'roc_auc' - nfold: int = 3 - init_points: int = 1 - n_iter: int = 2 - n_jobs: int = -1 - - - - - def modelBO(self): - model_clf = xgb.XGBClassifier() - self.__model_hdl = ModelHandler(model_clf, self.features_for_train) - self.__model_hdl.optimize_params_bayes(self.train_test_data, self.hyper_pars_ranges, - self.metrics, self.nfold, self.init_points, self.n_iter, self.n_jobs) - - - - def train_test_pred(self): - self.__model_hdl.train_test_model(self.train_test_data) - - y_pred_train = self.__model_hdl.predict(self.train_test_data[0], False) - y_pred_test = self.__model_hdl.predict(self.train_test_data[2], False) - - - return y_pred_train, y_pred_test - - - def save_predictions(self, filename): - 
print(self.__model_hdl.get_original_model()) - self.__model_hdl.dump_original_model(self.output_path+'/'+filename, xgb_format=False) - - - def load_model(self, filename): - self.__model_hdl.load_model_handler(filename) - - - def get_mode_booster(self): - return self.__model_hdl.model - - - def plot_dists(self): - - leg_labels = ['background', 'signal'] - ml_out_fig = plot_utils.plot_output_train_test(self.__model_hdl, self.train_test_data, 100, - False, leg_labels, True, density=True) - - plt.savefig(str(self.output_path)+'/thresholds.png') From bd9fab234ae7f44c9d67fa6735bf1f30e55468d9 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 25 Oct 2021 21:37:17 +0300 Subject: [PATCH 27/46] Deleted old file --- DFconverter.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 DFconverter.py diff --git a/DFconverter.py b/DFconverter.py deleted file mode 100644 index 5cb2f23..0000000 --- a/DFconverter.py +++ /dev/null @@ -1,22 +0,0 @@ -from hipe4ml.tree_handler import TreeHandler -import tomli -import sys - - -def convertDF(input_file): - """ - Opens input file in toml format, retrives signal, background and deploy data - like TreeHandler objects - - Parameters - ------------------------------------------------ - df: str - input toml file - """ - with open(str(input_file), encoding="utf-8") as inp_file: - inp_dict = tomli.load(inp_file) - signal = TreeHandler(inp_dict["signal"]["path"], inp_dict["signal"]["tree"]) - background = TreeHandler(inp_dict["background"]["path"], inp_dict["background"]["tree"]) - deploy = TreeHandler(inp_dict["deploy"]["path"], inp_dict["deploy"]["tree"]) - - return signal, background, deploy From 140630d9cc405710a71e9884eed11796461a1349 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 26 Oct 2021 09:36:25 +0300 Subject: [PATCH 28/46] Added requirements --- requirements.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file 
mode 100644 index 0000000..2ff04a1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +uproot +matplotlib +pandas +numpy +scikit-learn +xgboost +bayesian-optimization From dcde05fd99c1520cdcc4b7d2d37fe8808b7bf89a Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 26 Oct 2021 10:04:51 +0300 Subject: [PATCH 29/46] Added hipe4ml to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2ff04a1..72797d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ numpy scikit-learn xgboost bayesian-optimization +hipe4ml From dd326cb86a4674f16aff2a09369af6dc6225afa5 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 10 Nov 2021 11:02:36 +0200 Subject: [PATCH 30/46] New module for saving hists as ROOT objects --- cand_class/hists_root.py | 291 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 cand_class/hists_root.py diff --git a/cand_class/hists_root.py b/cand_class/hists_root.py new file mode 100644 index 0000000..9542182 --- /dev/null +++ b/cand_class/hists_root.py @@ -0,0 +1,291 @@ +import ROOT + +from array import array + +from cand_class.helper import * + +from dataclasses import dataclass + +@dataclass +class HistBuilder: + + output_path : str + root_output_name = 'hists.root' + + def __post_init__(self): + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") + __hist_out.cd() + + ROOT.gDirectory.mkdir('Signal') + ROOT.gDirectory.mkdir('Background') + + __hist_out.cd() + __hist_out.cd('Signal') + + + ROOT.gDirectory.mkdir('train') + ROOT.gDirectory.mkdir('test') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Signal/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + + __hist_out.cd() + __hist_out.cd('Background') + + ROOT.gDirectory.mkdir('train') + 
ROOT.gDirectory.mkdir('test') + + + __hist_out.cd() + ROOT.gDirectory.cd('Background/train') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.cd() + ROOT.gDirectory.cd('Background/test') + ROOT.gDirectory.mkdir('pt_rap') + ROOT.gDirectory.mkdir('hists') + + __hist_out.Close() + + + def roc_curve_root(self): + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + + __hist_out.cd() + + fpr = roc_curve_data['fpr_train'] + tpr = roc_curve_data['tpr_train'] + + + fpr1 = roc_curve_data['fpr_test'] + tpr1 = roc_curve_data['tpr_test'] + + + fpr_d_tr = array('d', fpr.tolist()) + tpr_d_tr = array('d', tpr.tolist()) + + fpr_d_ts = array('d', fpr1.tolist()) + tpr_d_ts = array('d', tpr1.tolist()) + + train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) + test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) + + train_roc.SetLineColor(ROOT.kRed + 2) + test_roc.SetLineColor(ROOT.kBlue + 2) + + + train_roc.SetLineWidth(3) + test_roc.SetLineWidth(3) + + + train_roc.SetLineStyle(9) + test_roc.SetLineStyle(9) + + + train_roc.SetTitle("Receiver operating characteristic train") + test_roc.SetTitle("Receiver operating characteristic test") + + train_roc.GetXaxis().SetTitle('FPR'); + train_roc.GetYaxis().SetTitle('TPR'); + + test_roc.GetXaxis().SetTitle('FPR'); + test_roc.GetYaxis().SetTitle('TPR'); + + train_roc.Write("Train_roc") + test_roc.Write("Test_roc") + + __hist_out.Close() + + def pt_rap_root(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): + + if sign ==0: + s_label = 'Background' + m = 5 + + if sign==1: + s_label = 'Signal' + m = 1 + + rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5) + saved = round((df_cut.shape[0] / df_orig.shape[0]) * 100, 5) + diff = df_orig.shape[0] - df_cut.shape[0] + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') + + + rapidity_orig = 
array('d', df_orig['rapidity'].values.tolist()) + pT_orig = array('d', df_orig['pT'].values.tolist()) + pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_orig)): + pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_before_cut.Draw('COLZ') + + + + + + rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) + pT_cut = array('d', df_cut['pT'].values.tolist()) + + pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_cut)): + pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_cut.GetXaxis().SetTitle('rapidity'); + pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); + pT_rap_cut.Draw('COLZ') + + + + + + rapidity_diff = array('d', difference['rapidity'].values.tolist()) + pT_diff = array('d', difference['pT'].values.tolist()) + + pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), + max(x_range), 100, min(x_range), max(x_range)) + + + for i in range(len(rapidity_diff)): + pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) + + ROOT.gStyle.SetOptStat(0) + ROOT.gStyle.SetPalette(ROOT.kBird) + + pT_rap_diff.GetXaxis().SetTitle('rapidity'); + pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); + pT_rap_diff.Draw('COLZ') + + + pT_rap_before_cut.Write() + pT_rap_cut.Write() + pT_rap_diff.Write() + + __hist_out.Close() + + + def hist_variables_root(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut,difference_s, sample): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + 
---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); + __hist_out.cd() + + + for feature in dfs_orig.columns: + + dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) + dfb_orig_feat = array('d', dfb_orig[feature].values.tolist()) + + + dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) + dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) + + + dfs_diff_feat = array('d', difference_s[feature].values.tolist()) + + + dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, + min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) + + for i in range(len(dfs_orig_feat)): + dfs_orig_root.Fill(dfs_orig_feat[i]) + + dfs_orig_root.GetXaxis().SetTitle(feature); + + dfs_orig_root.Draw() + + dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, + min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) + + for i in range(len(dfb_orig_feat)): + dfb_orig_root.Fill(dfb_orig_feat[i]) + + dfb_orig_root.GetXaxis().SetTitle(feature); + dfb_orig_root.Draw() + + + dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, + min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) + + for i in range(len(dfs_cut_feat)): + dfs_cut_root.Fill(dfs_cut_feat[i]) + + dfs_cut_root.GetXaxis().SetTitle(feature); + dfs_cut_root.Draw() + + + dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, + min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) + + for i in range(len(dfb_cut_feat)): + dfb_cut_root.Fill(dfb_cut_feat[i]) + + dfb_cut_root.GetXaxis().SetTitle(feature); + dfb_cut_root.Draw() + + + dfs_diff_root = 
ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, + min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) + + for i in range(len(dfs_diff_feat)): + dfs_diff_root.Fill(dfs_diff_feat[i]) + + dfs_diff_root.GetXaxis().SetTitle(feature); + dfs_diff_root.Draw() + + + __hist_out.cd() + + __hist_out.cd('Signal'+'/'+sample+'/'+'hists') + dfs_orig_root.Write() + dfs_cut_root.Write() + dfs_diff_root.Write() + + __hist_out.cd() + + __hist_out.cd('Background'+'/'+sample+'/'+'hists') + + dfb_orig_root.Write() + dfb_cut_root.Write() + + __hist_out.Close() From debe349cf359084b21e5c954dc2dfd2d29b01dbd Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 10 Nov 2021 11:06:14 +0200 Subject: [PATCH 31/46] Chabged argument of transform_df_to_log. Doesn't read config file to retrieve variables that require log scale(and ones that doesn't require, user specifies these variables --- cand_class/helper.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cand_class/helper.py b/cand_class/helper.py index 182c8b6..31a62c4 100644 --- a/cand_class/helper.py +++ b/cand_class/helper.py @@ -2,8 +2,6 @@ import pandas as pd import matplotlib.pyplot as plt -from config_reader import * - import xgboost as xgb from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score from numpy import sqrt, log, argmax @@ -12,7 +10,7 @@ -def transform_df_to_log(df, vars, inp_file): +def transform_df_to_log(df, vars, non_log_x, log_x): """ Transforms DataFrame to DataFrame with features in log scale Parameters @@ -27,8 +25,6 @@ def transform_df_to_log(df, vars, inp_file): """ df_new = pd.DataFrame() - non_log_x, log_x = read_log_vars(inp_file) - for var in vars: From 399394f61eea321391131f54e3600965bc853fa9 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 10 Nov 2021 11:09:46 +0200 Subject: [PATCH 32/46] Deleted part with saving hists as ROOT objects(moved to separate hists_root module --- cand_class/apply_model.py | 
248 +------------------------------------- 1 file changed, 1 insertion(+), 247 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 3984c20..6cff948 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -3,7 +3,7 @@ import matplotlib.pyplot as plt import xgboost as xgb from hipe4ml.plot_utils import plot_roc_train_test -from helper import * +from cand_class.helper import * from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score from dataclasses import dataclass @@ -15,9 +15,6 @@ import gc import matplotlib as mpl -import ROOT - -from array import array mpl.rc('figure', max_open_warning = 0) @@ -34,7 +31,6 @@ class ApplyXGB: output_path : str - root_output_name = 'hists.root' __bst_train : pd = pd.DataFrame() __bst_test : pd = pd.DataFrame() @@ -42,53 +38,6 @@ class ApplyXGB: __train_best_thr : int = 0 __test_best_thr : int = 0 - - def __post_init__(self): - - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE") - __hist_out.cd() - - ROOT.gDirectory.mkdir('Signal') - ROOT.gDirectory.mkdir('Background') - - __hist_out.cd() - __hist_out.cd('Signal') - - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - __hist_out.cd() - ROOT.gDirectory.cd('Signal/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.cd() - ROOT.gDirectory.cd('Signal/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - - __hist_out.cd() - __hist_out.cd('Background') - - ROOT.gDirectory.mkdir('train') - ROOT.gDirectory.mkdir('test') - - - __hist_out.cd() - ROOT.gDirectory.cd('Background/train') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.cd() - ROOT.gDirectory.cd('Background/test') - ROOT.gDirectory.mkdir('pt_rap') - ROOT.gDirectory.mkdir('hists') - - __hist_out.Close() - - def apply_predictions(self): self.__bst_train["xgb_preds"] = self.y_pred_train @@ -107,53 +56,6 @@ def get_threshold(self): 
self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], self.output_path) - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - - __hist_out.cd() - - fpr = roc_curve_data['fpr_train'] - tpr = roc_curve_data['tpr_train'] - - - fpr1 = roc_curve_data['fpr_test'] - tpr1 = roc_curve_data['tpr_test'] - - - fpr_d_tr = array('d', fpr.tolist()) - tpr_d_tr = array('d', tpr.tolist()) - - fpr_d_ts = array('d', fpr1.tolist()) - tpr_d_ts = array('d', tpr1.tolist()) - - train_roc = ROOT.TGraph(len(fpr_d_tr), fpr_d_tr, tpr_d_tr) - test_roc = ROOT.TGraph(len(fpr_d_ts), fpr_d_ts, tpr_d_ts) - - train_roc.SetLineColor(ROOT.kRed + 2) - test_roc.SetLineColor(ROOT.kBlue + 2) - - - train_roc.SetLineWidth(3) - test_roc.SetLineWidth(3) - - - train_roc.SetLineStyle(9) - test_roc.SetLineStyle(9) - - - train_roc.SetTitle("Receiver operating characteristic train") - test_roc.SetTitle("Receiver operating characteristic test") - - train_roc.GetXaxis().SetTitle('FPR'); - train_roc.GetYaxis().SetTitle('TPR'); - - test_roc.GetXaxis().SetTitle('FPR'); - test_roc.GetYaxis().SetTitle('TPR'); - - train_roc.Write("Train_roc") - test_roc.Write("Test_roc") - - __hist_out.Close() - return self.__train_best_thr, self.__test_best_thr @@ -313,77 +215,6 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - __hist_out.cd() - ROOT.gDirectory.cd(s_label+'/'+data_name+'/'+'pt_rap') - - - rapidity_orig = array('d', df_orig['rapidity'].values.tolist()) - pT_orig = array('d', df_orig['pT'].values.tolist()) - pT_rap_before_cut = ROOT.TH2D( 'pT_rap_before_ML'+data_name, 'pT_rap_before_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_orig)): - pT_rap_before_cut.Fill(rapidity_orig[i], pT_orig[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - 
pT_rap_before_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_before_cut.GetYaxis().SetTitle('pT, GeV'); - pT_rap_before_cut.Draw('COLZ') - - - - - - rapidity_cut = array('d', df_cut['rapidity'].values.tolist()) - pT_cut = array('d', df_cut['pT'].values.tolist()) - - pT_rap_cut = ROOT.TH2D( 'pT_rap_after_ML_'+data_name, 'pT_rap_after_ML_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_cut)): - pT_rap_cut.Fill(rapidity_cut[i], pT_cut[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_cut.GetXaxis().SetTitle('rapidity'); - pT_rap_cut.GetYaxis().SetTitle('pT, GeV'); - pT_rap_cut.Draw('COLZ') - - - - - - rapidity_diff = array('d', difference['rapidity'].values.tolist()) - pT_diff = array('d', difference['pT'].values.tolist()) - - pT_rap_diff = ROOT.TH2D('pT_rap_diff_'+data_name, 'pT_rap_diff_'+data_name, 100, min(x_range), - max(x_range), 100, min(x_range), max(x_range)) - - - for i in range(len(rapidity_diff)): - pT_rap_diff.Fill(rapidity_diff[i], pT_diff[i]) - - ROOT.gStyle.SetOptStat(0) - ROOT.gStyle.SetPalette(ROOT.kBird) - - pT_rap_diff.GetXaxis().SetTitle('rapidity'); - pT_rap_diff.GetYaxis().SetTitle('pT, GeV'); - pT_rap_diff.Draw('COLZ') - - - pT_rap_before_cut.Write() - pT_rap_cut.Write() - pT_rap_diff.Write() - - __hist_out.Close() - - def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): """ @@ -401,8 +232,6 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe name of pdf document with distributions """ - __hist_out = ROOT.TFile(self.output_path+'/'+self.root_output_name, "UPDATE"); - __hist_out.cd() for feature in dfs_orig.columns: @@ -481,80 +310,5 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe plt.savefig(pdf_key,format='pdf') - dfs_orig_feat = array('d', dfs_orig[feature].values.tolist()) - dfb_orig_feat = array('d', 
dfb_orig[feature].values.tolist()) - - - dfs_cut_feat = array('d', dfs_cut[feature].values.tolist()) - dfb_cut_feat = array('d', dfb_cut[feature].values.tolist()) - - - dfs_diff_feat = array('d', difference_s[feature].values.tolist()) - - - dfs_orig_root = ROOT.TH1D('signal before ML '+feature, 'signal before ML '+feature, 500, - min(dfs_orig[feature].values.tolist()), max(dfs_orig[feature].values.tolist())) - - for i in range(len(dfs_orig_feat)): - dfs_orig_root.Fill(dfs_orig_feat[i]) - - dfs_orig_root.GetXaxis().SetTitle(feature); - - dfs_orig_root.Draw() - - dfb_orig_root = ROOT.TH1D('background before ML '+feature, 'background before ML '+feature, 500, - min(dfb_orig[feature].values.tolist()), max(dfb_orig[feature].values.tolist())) - - for i in range(len(dfb_orig_feat)): - dfb_orig_root.Fill(dfb_orig_feat[i]) - - dfb_orig_root.GetXaxis().SetTitle(feature); - dfb_orig_root.Draw() - - - dfs_cut_root = ROOT.TH1D('signal after ML '+feature, 'signal after ML '+feature, 500, - min(dfs_cut[feature].values.tolist()), max(dfs_cut[feature].values.tolist())) - - for i in range(len(dfs_cut_feat)): - dfs_cut_root.Fill(dfs_cut_feat[i]) - - dfs_cut_root.GetXaxis().SetTitle(feature); - dfs_cut_root.Draw() - - - dfb_cut_root = ROOT.TH1D('background after ML '+feature, 'background after ML '+feature, 500, - min(dfb_cut[feature].values.tolist()), max(dfb_cut[feature].values.tolist())) - - for i in range(len(dfb_cut_feat)): - dfb_cut_root.Fill(dfb_cut_feat[i]) - - dfb_cut_root.GetXaxis().SetTitle(feature); - dfb_cut_root.Draw() - - - dfs_diff_root = ROOT.TH1D('signal difference '+feature, 'signal difference '+feature, 500, - min(difference_s[feature].values.tolist()), max(difference_s[feature].values.tolist())) - - for i in range(len(dfs_diff_feat)): - dfs_diff_root.Fill(dfs_diff_feat[i]) - - dfs_diff_root.GetXaxis().SetTitle(feature); - dfs_diff_root.Draw() - - - __hist_out.cd() - - __hist_out.cd('Signal'+'/'+sample+'/'+'hists') - dfs_orig_root.Write() - dfs_cut_root.Write() - 
dfs_diff_root.Write() - - __hist_out.cd() - - __hist_out.cd('Background'+'/'+sample+'/'+'hists') - - dfb_orig_root.Write() - dfb_cut_root.Write() - __hist_out.Close() pdf_key.close() From 33dde1fd1978f2a6675641a5037af8b5c456aa01 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 10 Nov 2021 12:17:15 +0200 Subject: [PATCH 33/46] Added treelite(to convert predictions to C++ libraty) to dependencies --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 72797d4..c54891a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ scikit-learn xgboost bayesian-optimization hipe4ml +treelite +treelite_runtime From bd1481bd6150c3c7106292d5cd1011558ba3b239 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 16 Nov 2021 11:20:13 +0200 Subject: [PATCH 34/46] Matplotlib was deleted from requirements --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c54891a..91f39c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ uproot -matplotlib pandas numpy scikit-learn From 6cd450f2e4bab32535d173ef2e66ab1b92c13893 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 22 Nov 2021 11:34:26 +0200 Subject: [PATCH 35/46] Created one method to compute threshold and apply it to dataset --- cand_class/apply_model.py | 83 +++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 6cff948..ee67826 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -15,7 +15,7 @@ import gc import matplotlib as mpl - +from cand_class.helper import * mpl.rc('figure', max_open_warning = 0) @@ -23,6 +23,9 @@ @dataclass class ApplyXGB: + x_train : pd.core.frame.DataFrame + x_test : pd.core.frame.DataFrame + y_pred_train : np.ndarray y_pred_test : np.ndarray @@ -31,61 +34,37 @@ class ApplyXGB: output_path : str + __train_res = 
pd.DataFrame() + __test_res = pd.DataFrame() - __bst_train : pd = pd.DataFrame() - __bst_test : pd = pd.DataFrame() - - __train_best_thr : int = 0 - __test_best_thr : int = 0 - - def apply_predictions(self): - - self.__bst_train["xgb_preds"] = self.y_pred_train - self.__bst_train['issignal'] = self.y_train - - - self.__bst_test["xgb_preds"] = self.y_pred_test - self.__bst_test['issignal'] = self.y_test - - return self.__bst_train, self.__bst_test - - - def get_threshold(self): + __best_train_thr : int = 0 + __best_test_thr : int = 0 - self.__train_best_thr, self.__test_best_thr, roc_curve_data = AMS(self.y_train, - self.__bst_train['xgb_preds'], self.y_test, self.__bst_test['xgb_preds'], - self.output_path) - return self.__train_best_thr, self.__test_best_thr + def get_predictions(self): + self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, + self.y_test, self.y_pred_test, self.output_path) + train_pred = ((self.y_pred_train > self.__best_train_thr)*1) + test_pred = ((self.y_pred_test > self.__best_test_thr)*1) - def apply_threshold(self): - cut_train = self.__train_best_thr - self.__train_pred = ((self.__bst_train['xgb_preds']>cut_train)*1) + self.__train_res = self.x_train.copy() + self.__train_res['xgb_preds1'] = train_pred - cut_test = self.__test_best_thr - self.__test_pred = ((self.__bst_test['xgb_preds']>cut_test)*1) + self.__test_res = self.x_test.copy() + self.__test_res['xgb_preds1'] = test_pred - return self.__train_pred, self.__test_pred - - - def get_result(self, x_train, x_test): - - train_with_preds = x_train.copy() - train_with_preds['xgb_preds1'] = self.__train_pred.values - - - test_with_preds = x_test.copy() - test_with_preds['xgb_preds1'] = self.__test_pred.values - - return train_with_preds, test_with_preds + return self.__train_res, self.__test_res def features_importance(self, bst): # this one needs to be tested ax = xgb.plot_importance(bst) plt.rcParams['figure.figsize'] = [6, 3] + 
plt.rcParams['font.size'] = 15 + ax.xaxis.set_tick_params(labelsize=13) + ax.yaxis.set_tick_params(labelsize=13) ax.figure.tight_layout() ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") @@ -110,23 +89,31 @@ def CM_plot_train_test(self): """ #lets take the best threshold and look at the confusion matrix - cnf_matrix_train = confusion_matrix(self.__bst_train['issignal'], self.__train_pred, labels=[1,0]) + cnf_matrix_train = confusion_matrix(self.__train_res['issignal'], self.__train_res['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) - fig_train, axs_train = plt.subplots(figsize=(10, 8)) + fig_train, axs_train = plt.subplots(figsize=(8, 6)) axs_train.yaxis.set_label_coords(-0.04,.5) axs_train.xaxis.set_label_coords(0.5,-.005) + + axs_train.xaxis.set_tick_params(labelsize=15) + axs_train.yaxis.set_tick_params(labelsize=15) + plot_confusion_matrix(cnf_matrix_train, classes=['signal','background'], - title=' Train Dataset Confusion Matrix for XGB for cut > '+str(self.__train_best_thr)) + title=' Train Dataset Confusion Matrix for cut > '+"%.4f"%self.__best_train_thr) plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') - cnf_matrix_test = confusion_matrix(self.__bst_test['issignal'], self.__test_pred, labels=[1,0]) + cnf_matrix_test = confusion_matrix(self.__test_res['issignal'], self.__test_res['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) - fig_test, axs_test = plt.subplots(figsize=(10, 8)) + fig_test, axs_test = plt.subplots(figsize=(8, 6)) axs_test.yaxis.set_label_coords(-0.04,.5) axs_test.xaxis.set_label_coords(0.5,-.005) + + axs_test.xaxis.set_tick_params(labelsize=15) + axs_test.yaxis.set_tick_params(labelsize=15) + plot_confusion_matrix(cnf_matrix_test, classes=['signal','background'], - title=' Test Dataset Confusion Matrix for XGB for cut > '+str(self.__test_best_thr)) + title=' Test Dataset Confusion Matrix for cut > '+"%.4f"%self.__best_test_thr) 
plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') From eae53ddf8983725eecc4da521eb8a5937f976ef3 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 22 Nov 2021 11:37:03 +0200 Subject: [PATCH 36/46] Corrected non-log variables return --- cand_class/config_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cand_class/config_reader.py b/cand_class/config_reader.py index a98e7df..24b47eb 100644 --- a/cand_class/config_reader.py +++ b/cand_class/config_reader.py @@ -43,7 +43,7 @@ def read_log_vars(inp_file): with open(str(inp_file), encoding="utf-8") as inp_file: inp_dict = tomli.load(inp_file) - return inp_dict['log_scale']['variables'], inp_dict['log_scale']['variables'] + return inp_dict['non_log_scale']['variables'], inp_dict['log_scale']['variables'] From dac89de180e5fd80424ac193e22fabb2f836895e Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 22 Nov 2021 11:38:15 +0200 Subject: [PATCH 37/46] Changed font sizes --- cand_class/MLconfig_variables.py | 56 +++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/cand_class/MLconfig_variables.py b/cand_class/MLconfig_variables.py index 9221722..605c183 100644 --- a/cand_class/MLconfig_variables.py +++ b/cand_class/MLconfig_variables.py @@ -67,16 +67,16 @@ def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, path that contains output plot """ - fig, ax = plt.subplots(figsize=(20,10)) - plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='') - plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='') + fig, ax = plt.subplots(figsize=(10,6)) + plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='--o') + plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='--o') ax.grid(zorder=0) - ax.set_xticklabels(vars_to_draw, fontsize=30, rotation =70) + ax.set_xticklabels(vars_to_draw, fontsize=15, rotation =70) # ax.set_yticklabels([-0.5,-0.4, 
-0.2,0, -0.2, 0.4], fontsize=25) - ax.yaxis.set_tick_params(labelsize=30) - plt.legend(('signal','background'), fontsize = 25) - plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 30) - plt.ylabel('Correlation coefficient', fontsize = 30) + ax.yaxis.set_tick_params(labelsize=15) + plt.legend(('signal','background'), fontsize = 15) + plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 18) + plt.ylabel('Correlation coefficient', fontsize = 15) fig.tight_layout() fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png') @@ -119,7 +119,7 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): for var in df.columns: if var != variable_xaxis: - fig, axs = plt.subplots(figsize=(20, 15)) + fig, axs = plt.subplots(figsize=(10, 6)) bin_means, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='mean', bins=25) bin_std, bin_edges, binnumber = b_s(df[variable_xaxis],df[var], statistic='std', bins=25) @@ -134,20 +134,21 @@ def profile_mass(df,variable_xaxis, sign, peak, edge_left, edge_right, pdf_key): bin_std = np.delete(bin_std , nan_ind) - plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', marker='.',mfc='red', ms=10) + plt.errorbar(x=bin_centers, y=bin_means, yerr=(bin_std/np.sqrt(bin_count)), linestyle='none', linewidth = 2, marker='.',mfc='red', ms=15) + plt.locator_params(axis='y', nbins=5) + plt.locator_params(axis='x', nbins=5) + plt.title('Mean of ' +var+ ' vs bin centers of '+variable_xaxis+ \ + '('+keyword+')', fontsize=19) + plt.xlabel('Mass', fontsize=17) + plt.ylabel(" SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=17) - plt.title('Mean of ' +var+ ' plotted versus bin centers of '+variable_xaxis+ \ - '('+keyword+')', fontsize=25) - plt.xlabel('Mass', fontsize=25) - plt.ylabel("Mean of each bin with the SEM ($\dfrac{bin\ std}{\sqrt{bin\ count}}$) of bin", fontsize=25) + 
plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-', linewidth = 3) - plt.vlines(x=peak,ymin=bin_means.min(),ymax=bin_means.max(), color='r', linestyle='-') - - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) + axs.xaxis.set_tick_params(labelsize=16) + axs.yaxis.set_tick_params(labelsize=16) fig.tight_layout() plt.savefig(pdf_key,format='pdf') @@ -220,28 +221,31 @@ def plot2D_mass(df, sample, mass_var, mass_range, sgn, peak, pdf_key): for var in df.columns: if var != mass_var: - fig, axs = plt.subplots(figsize=(15, 10)) + fig, axs = plt.subplots(figsize=(6, 4)) cax = plt.hist2d(df[mass_var],df[var],range=[mass_range, [df[var].min(), df[var].max()]], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis) if sgn==1: - plt.title('Signal candidates ' + sample, fontsize = 25) + plt.title('Signal candidates ' + sample, fontsize = 15) if sgn==0: - plt.title('Background candidates ' + sample, fontsize = 25) + plt.title('Background candidates ' + sample, fontsize = 15) - plt.xlabel(mass_var, fontsize=25) - plt.ylabel(var, fontsize=25) + plt.xlabel(mass_var, fontsize=16) + plt.ylabel(var, fontsize=16) - plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-') + plt.vlines(x=peak,ymin=df[var].min(),ymax=df[var].max(), color='r', linestyle='-', linewidth = 4) mpl.pyplot.colorbar() - axs.xaxis.set_tick_params(labelsize=20) - axs.yaxis.set_tick_params(labelsize=20) + axs.xaxis.set_tick_params(labelsize=11) + axs.yaxis.set_tick_params(labelsize=11) + + plt.locator_params(axis='y', nbins=5) + plt.locator_params(axis='x', nbins=5) plt.legend(shadow=True,title =str(len(df))+ " samples") From e7bbc38fae89f60f7b6a6ddbf596c64cf01ecdf7 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 22 Nov 2021 11:40:25 +0200 Subject: [PATCH 38/46] corrected signal and background diff --- cand_class/helper.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git 
a/cand_class/helper.py b/cand_class/helper.py index 31a62c4..4f05cca 100644 --- a/cand_class/helper.py +++ b/cand_class/helper.py @@ -9,7 +9,6 @@ import treelite - def transform_df_to_log(df, vars, non_log_x, log_x): """ Transforms DataFrame to DataFrame with features in log scale @@ -84,17 +83,17 @@ def AMS(y_true, y_predict, y_true1, y_predict1, output_path): S0_best_threshold1 = (thresholds[xi1]) - fig, ax = plt.subplots(figsize=(12, 8), dpi = 100) + fig, ax = plt.subplots(figsize=(5, 4), dpi = 100) plt.plot(fpr, tpr, linewidth=3 ,linestyle=':',color='darkorange',label='ROC curve train (area = %0.6f)' % roc_auc) plt.plot(fpr1, tpr1, color='green',label='ROC curve test (area = %0.6f)' % roc_auc1) plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Random guess') #plt.scatter(fpr[xi], tpr[xi], marker='o', color='black', label= 'Best Threshold train set = '+"%.4f" % S0_best_threshold +'\n AMS = '+ "%.2f" % S0[xi]) plt.scatter(fpr1[xi1], tpr1[xi1], marker='o', s=80, color='blue', label= 'Best Threshold test set = '+"%.4f" % S0_best_threshold1 +'\n AMS = '+ "%.2f" % S01[xi1]) - plt.xlabel('False Positive Rate', fontsize = 18) - plt.ylabel('True Positive Rate', fontsize = 18) - plt.legend(loc="lower right", fontsize = 18) - plt.title('Receiver operating characteristic', fontsize = 18) - ax.tick_params(axis='both', which='major', labelsize=18) + plt.xlabel('False Positive Rate', fontsize = 10) + plt.ylabel('True Positive Rate', fontsize = 10) + plt.legend(loc="lower right", fontsize = 8) + plt.title('Receiver operating characteristic', fontsize = 13) + ax.tick_params(axis='both', which='major', labelsize=10) plt.xlim([-0.01, 1.0]) plt.ylim([0, 1.02]) #axs.axis([-0.01, 1, 0.9, 1]) @@ -124,8 +123,11 @@ def plot_confusion_matrix(cm, classes, print(cm) + # plt.subplots(figsize=(12, 9)) + # plt.figure(figsize = (6, 5)) plt.imshow(cm, interpolation='nearest', cmap=cmap) - plt.title(title) + + plt.title(title, fontsize = 20) plt.colorbar() tick_marks = 
np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) @@ -136,7 +138,7 @@ def plot_confusion_matrix(cm, classes, for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", - color="white" if cm[i, j] > thresh else "black") + color="white" if cm[i, j] > thresh else "black", size = 15) plt.tight_layout() plt.ylabel('True label',fontsize = 15) @@ -191,7 +193,7 @@ def difference_df(df_orig, df_cut, cut): def diff_SB_cut(df, target_label): - dfs_cut = df[(df['xgb_preds1']==1) & (df[target_label]==1)] + dfs_cut = df[(df['xgb_preds1']==1)] dfb_cut = df[(df['xgb_preds1']==1) & (df[target_label]==0)] return dfs_cut, dfb_cut From 02478e1b4d8f36ad3eab1b7242426b9e4a2a0fea Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 3 Dec 2021 12:55:28 +0200 Subject: [PATCH 39/46] hist_variables requires dataframe and labels and computes signal and background difference inside the function --- cand_class/apply_model.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index ee67826..888f183 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -202,8 +202,7 @@ def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, da - - def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, sample, pdf_key): + def hist_variables(self, mass_var, df, sign_label, pred_label, sample, pdf_key): """ Applied quality cuts and created distributions for all the features in pdf file @@ -219,10 +218,20 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe name of pdf document with distributions """ + dfs_orig = df[df[sign_label]==1] + dfb_orig = df[df[sign_label]==0] + + dfs_cut = df[(df[sign_label]==1) & (df[pred_label]==1)] + dfb_cut = df[(df[sign_label]==0) & (df[pred_label]==1)] + + diff_vars = 
dfs_orig.columns.drop([sign_label, pred_label]) + + difference_s = pd.concat([dfs_orig[diff_vars], dfs_cut[diff_vars]]).drop_duplicates(keep=False) - for feature in dfs_orig.columns: - fig, ax = plt.subplots(3, figsize=(20, 10)) + + for feature in diff_vars: + fig, ax = plt.subplots(3, figsize=(15, 10)) fontP = FontProperties() @@ -251,9 +260,17 @@ def hist_variables(self, mass_var, dfs_orig, dfb_orig, dfs_cut, dfb_cut, differe fig.tight_layout() + if len(dfb_cut) !=0: + s_b_cut = round(len(dfs_cut)/len(dfb_cut), 3) + title1 = 'S/B='+ str(s_b_cut) + else: + title1 = 'S = '+str(len(dfs_cut)) + ' all bgr was cut' + + + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + + ax[1].legend(shadow=True,title = title1 + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + '\nquality cuts + ML cut', title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), From 0d5cdd6092d8e7bc5c36a06c2cfe8686be922397 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 3 Dec 2021 12:57:36 +0200 Subject: [PATCH 40/46] Changed the way of log transformation --- cand_class/helper.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cand_class/helper.py b/cand_class/helper.py index 4f05cca..2f85f52 100644 --- a/cand_class/helper.py +++ b/cand_class/helper.py @@ -22,15 +22,12 @@ def transform_df_to_log(df, vars, non_log_x, log_x): config TOML file with list of features that should and shouldn't be transformed to log scale """ - df_new = pd.DataFrame() - - + df_new = df.copy() for var in vars: if var in log_x: - df_new['log('+ var+')'] = np.log(df[var]) - if var in non_log_x: - df_new[var] = df[var] + df_new[var] = df_new[var].apply(np.log) + df_new = df_new.rename(columns={var: 'log('+ var+')'}) return df_new From 
af8cf1e553cbe7d995f2261570b50207fc15e5ca Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 3 Dec 2021 16:29:20 +0200 Subject: [PATCH 41/46] Only one dataframe is required --- cand_class/apply_model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 888f183..8fb10e3 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -117,18 +117,18 @@ def CM_plot_train_test(self): plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') - - def pT_vs_rapidity(self, df_orig, df_cut, difference, sign, x_range, y_range, data_name): + def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name): fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) + df_orig = df[df[sign_label]==1] + + df_cut = df[df[pred_label]==1] + + diff_vars = ['pT', 'rapidity'] - if sign ==0: - s_label = 'Background' - m = 5 + difference = pd.concat([df_orig[diff_vars], df_cut[diff_vars]]).drop_duplicates(keep=False) - if sign==1: - s_label = 'Signal' - m = 1 + s_label = 'Signal ' axs[0].set_aspect(aspect = 'auto') axs[1].set_aspect(aspect = 'auto') From 7783a233b78dfb28af695a5ad0b471dc77667229 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 17 Feb 2022 16:11:37 +0200 Subject: [PATCH 42/46] Added comments to ApplyXGB members --- cand_class/apply_model.py | 116 +++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 21 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 8fb10e3..b394b09 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -22,6 +22,31 @@ @dataclass class ApplyXGB: + """ + A class used to apply XGBoost predictions on the data + + ... 
+ + Attributes + ---------- + x_train : pd.core.frame.DataFrame + train dataframe + x_test : pd.core.frame.DataFrame + test dataframe + y_pred_train : np.ndarray + array with XGBoost predictions for train dataset + y_pred_test : np.ndarray + array with XGBoost predictions for test dataset + output_path : str + output path for plots + + Methods + ------- + get_predictions() + Returns train and test dataframes with predictions + features_importance(bst) + + """ x_train : pd.core.frame.DataFrame x_test : pd.core.frame.DataFrame @@ -43,6 +68,8 @@ class ApplyXGB: def get_predictions(self): + """ + """ self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, self.y_test, self.y_pred_test, self.output_path) @@ -59,17 +86,36 @@ def get_predictions(self): def features_importance(self, bst): - # this one needs to be tested - ax = xgb.plot_importance(bst) - plt.rcParams['figure.figsize'] = [6, 3] - plt.rcParams['font.size'] = 15 - ax.xaxis.set_tick_params(labelsize=13) - ax.yaxis.set_tick_params(labelsize=13) - ax.figure.tight_layout() - ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true negatives is C11. 
+ + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance + + Parameters + ---------- + bst: xgboost.sklearn.XGBClassifier + model's XGB classifier + + Returns + ------- + + Saves plot with XGB features importance + + """ + ax = xgb.plot_importance(bst) + plt.rcParams['figure.figsize'] = [6, 3] + plt.rcParams['font.size'] = 15 + ax.xaxis.set_tick_params(labelsize=13) + ax.yaxis.set_tick_params(labelsize=13) + ax.figure.tight_layout() + ax.figure.savefig(str(self.output_path)+"/xgb_train_variables_rank.png") - def CM_plot_train_test(self): + def CM_plot_train_test(self, issignal): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to the number of observations known to be in group i and predicted to be in @@ -81,15 +127,17 @@ def CM_plot_train_test(self): Parameters ---------- - test_best: numpy.float32 - best threshold + issignal: str + signal label + + Returns + ------- + + Saves plot with confusion matrix - x_train: dataframe - we want to get confusion matrix on training datasets """ - #lets take the best threshold and look at the confusion matrix - cnf_matrix_train = confusion_matrix(self.__train_res['issignal'], self.__train_res['xgb_preds1'], labels=[1,0]) + cnf_matrix_train = confusion_matrix(self.__train_res[issignal], self.__train_res['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) fig_train, axs_train = plt.subplots(figsize=(8, 6)) axs_train.yaxis.set_label_coords(-0.04,.5) @@ -103,7 +151,7 @@ def CM_plot_train_test(self): plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_train.png') - cnf_matrix_test = confusion_matrix(self.__test_res['issignal'], self.__test_res['xgb_preds1'], labels=[1,0]) + cnf_matrix_test = confusion_matrix(self.__test_res[issignal], self.__test_res['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) fig_test, axs_test = plt.subplots(figsize=(8, 6)) axs_test.yaxis.set_label_coords(-0.04,.5) @@ -117,14 
@@ def CM_plot_train_test(self): plt.savefig(str(self.output_path)+'/confusion_matrix_extreme_gradient_boosting_test.png') - def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name): + def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name, pt_rap): + """ + Plots distribution in pT-rapidity phase space + + Parameters + ---------- + df: pd.DataFrame + dataframe with XGBoost predictions + sign_label: str + dataframe column that specifies if the sample is signal or not + pred_label: str + dataframe column with XGBoost prediction + x_range: list + rapidity range + y_range: list + pT range + data_name: str + name of the dataset (for example, train or test) + pt_rap: list + list with pt and rapidity labels(for example ['pt', 'rapid']) + + Returns + ------- + + Saves distribution in pT-rapidity phase space + + """ fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]}) df_orig = df[df[sign_label]==1] df_cut = df[df[pred_label]==1] - diff_vars = ['pT', 'rapidity'] + diff_vars = pt_rap difference = pd.concat([df_orig[diff_vars], df_cut[diff_vars]]).drop_duplicates(keep=False) @@ -143,7 +217,7 @@ def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name - counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, + counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig[pt_rap[1]], df_orig[pt_rap[0]] , range = [x_range, y_range], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) @@ -164,7 +238,7 @@ def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name fig.tight_layout() - counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, + counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut[pt_rap[1]], df_cut[pt_rap[0]] , range = [x_range, 
y_range], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) @@ -181,7 +255,7 @@ def pT_vs_rapidity(self, df, sign_label, pred_label, x_range, y_range, data_name fig.tight_layout() - counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, + counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference[pt_rap[1]], difference[pt_rap[0]] , range = [x_range, y_range], bins=100, norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) axs[2].set_title(s_label + ' difference ', fontsize = 16) From cfef579966c6fc55c4365f3fda3c80966c9bffac Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 17 Feb 2022 18:43:55 +0200 Subject: [PATCH 43/46] One can adjust theshold manually or use optized AMS --- cand_class/apply_model.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index b394b09..99c817b 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -67,11 +67,22 @@ class ApplyXGB: - def get_predictions(self): + def get_predictions(self, ams, train_thr, test_thr): """ + Makes XGBoost predictions + + Returns + ------- + + Train and test dataframes with predictions """ - self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, - self.y_test, self.y_pred_test, self.output_path) + if ams==1: + self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, + self.y_test, self.y_pred_test, self.output_path) + + if ams==0 and train_thr!=0 and test_thr!=0: + self.__best_train_thr = train_thr + self.__best_test_thr = test_thr train_pred = ((self.y_pred_train > self.__best_train_thr)*1) test_pred = ((self.y_pred_test > self.__best_test_thr)*1) From b2834b68d1be7e78668b15326017f6d9ae7ca81e Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 23 Feb 2022 
22:33:43 +0200 Subject: [PATCH 44/46] get_predictions returns XGBoost BDT predictions, apply_prob_cut returns result predicted target of the sample with respect to the threshold --- cand_class/apply_model.py | 56 +++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 99c817b..2cb0793 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -67,15 +67,61 @@ class ApplyXGB: - def get_predictions(self, ams, train_thr, test_thr): + # def get_predictions(self, ams, train_thr, test_thr): + # """ + # Makes XGBoost predictions + # + # Returns + # ------- + # + # Train and test dataframes with predictions + # """ + # if ams==1: + # self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, + # self.y_test, self.y_pred_test, self.output_path) + # + # if ams==0 and train_thr!=0 and test_thr!=0: + # self.__best_train_thr = train_thr + # self.__best_test_thr = test_thr + # + # train_pred = ((self.y_pred_train > self.__best_train_thr)*1) + # test_pred = ((self.y_pred_test > self.__best_test_thr)*1) + # + # self.__train_res = self.x_train.copy() + # self.__train_res['xgb_preds1'] = train_pred + # + # self.__test_res = self.x_test.copy() + # self.__test_res['xgb_preds1'] = test_pred + # + # return self.__train_res, self.__test_res + + def get_predictions(self): """ - Makes XGBoost predictions + Makes XGB predictions Returns - ------- + -------- Train and test dataframes with predictions """ + self.__train_res = self.x_train.copy() + self.__train_res['xgb_preds'] = self.y_pred_train + + self.__test_res = self.x_test.copy() + self.__test_res['xgb_preds'] = self.y_pred_test + + return self.__train_res, self.__test_res + + + def apply_prob_cut(self, ams, train_thr, test_thr): + """ + Applies BDT cut on XGB probabilities and returns 'xgb_preds1' ==1 if + sample's prediction > BDT threshold, otherwise 'xgb_preds1' ==0 + If 
ams==1 BDT cut is computed with respect to AMS metrics optimization + If ams==0 and train_thr!=0 and test_thr!=0, one should specify + thresholds for test and train datasets manually + """ + if ams==1: self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, self.y_test, self.y_pred_test, self.output_path) @@ -87,15 +133,15 @@ def get_predictions(self, ams, train_thr, test_thr): train_pred = ((self.y_pred_train > self.__best_train_thr)*1) test_pred = ((self.y_pred_test > self.__best_test_thr)*1) - self.__train_res = self.x_train.copy() + self.__train_res['xgb_preds1'] = train_pred - self.__test_res = self.x_test.copy() self.__test_res['xgb_preds1'] = test_pred return self.__train_res, self.__test_res + def features_importance(self, bst): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to From f49e08c41bc1d97020b6c2251eb91f2b1a47e58f Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 23 Feb 2022 23:45:26 +0200 Subject: [PATCH 45/46] Added roc curve plot --- cand_class/apply_model.py | 37 ++++++++----------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/cand_class/apply_model.py b/cand_class/apply_model.py index 2cb0793..146deb8 100644 --- a/cand_class/apply_model.py +++ b/cand_class/apply_model.py @@ -16,6 +16,7 @@ import gc import matplotlib as mpl from cand_class.helper import * +from hipe4ml import plot_utils mpl.rc('figure', max_open_warning = 0) @@ -66,35 +67,6 @@ class ApplyXGB: __best_test_thr : int = 0 - - # def get_predictions(self, ams, train_thr, test_thr): - # """ - # Makes XGBoost predictions - # - # Returns - # ------- - # - # Train and test dataframes with predictions - # """ - # if ams==1: - # self.__best_train_thr, self.__best_test_thr, roc_curve_data = AMS(self.y_train, self.y_pred_train, - # self.y_test, self.y_pred_test, self.output_path) - # - # if ams==0 and train_thr!=0 and test_thr!=0: - # self.__best_train_thr = train_thr - # 
self.__best_test_thr = test_thr - # - # train_pred = ((self.y_pred_train > self.__best_train_thr)*1) - # test_pred = ((self.y_pred_test > self.__best_test_thr)*1) - # - # self.__train_res = self.x_train.copy() - # self.__train_res['xgb_preds1'] = train_pred - # - # self.__test_res = self.x_test.copy() - # self.__test_res['xgb_preds1'] = test_pred - # - # return self.__train_res, self.__test_res - def get_predictions(self): """ Makes XGB predictions @@ -141,6 +113,13 @@ def apply_prob_cut(self, ams, train_thr, test_thr): return self.__train_res, self.__test_res + def print_roc(self): + plot_utils.plot_roc_train_test(self.y_test, self.__test_res['xgb_preds1'], + self.y_train, self.__train_res['xgb_preds1'], None, ['background', 'signal']) + plt.savefig(str(self.output_path)+'/roc_curve.png') + + + def features_importance(self, bst): """ From 51e16a9dfd66db962c48aac5ba99d21cc4ddafe9 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 23 Feb 2022 23:47:25 +0200 Subject: [PATCH 46/46] Deleted roc curve plot from AMS function, one can only compute test and train thresholds with respect to AMS --- cand_class/helper.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/cand_class/helper.py b/cand_class/helper.py index 2f85f52..9b4aa7c 100644 --- a/cand_class/helper.py +++ b/cand_class/helper.py @@ -80,24 +80,6 @@ def AMS(y_true, y_predict, y_true1, y_predict1, output_path): S0_best_threshold1 = (thresholds[xi1]) - fig, ax = plt.subplots(figsize=(5, 4), dpi = 100) - plt.plot(fpr, tpr, linewidth=3 ,linestyle=':',color='darkorange',label='ROC curve train (area = %0.6f)' % roc_auc) - plt.plot(fpr1, tpr1, color='green',label='ROC curve test (area = %0.6f)' % roc_auc1) - plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Random guess') - #plt.scatter(fpr[xi], tpr[xi], marker='o', color='black', label= 'Best Threshold train set = '+"%.4f" % S0_best_threshold +'\n AMS = '+ "%.2f" % S0[xi]) - plt.scatter(fpr1[xi1], tpr1[xi1], marker='o', s=80, 
color='blue', label= 'Best Threshold test set = '+"%.4f" % S0_best_threshold1 +'\n AMS = '+ "%.2f" % S01[xi1]) - plt.xlabel('False Positive Rate', fontsize = 10) - plt.ylabel('True Positive Rate', fontsize = 10) - plt.legend(loc="lower right", fontsize = 8) - plt.title('Receiver operating characteristic', fontsize = 13) - ax.tick_params(axis='both', which='major', labelsize=10) - plt.xlim([-0.01, 1.0]) - plt.ylim([0, 1.02]) - #axs.axis([-0.01, 1, 0.9, 1]) - fig.tight_layout() - fig.savefig(str(output_path)+'/hists.png') - plt.show() - roc_curve_data = dict() roc_curve_data["fpr_train"] = fpr roc_curve_data["tpr_train"] = tpr